Showing preview only (6,365K chars total). Download the full file or copy to clipboard to get everything.
Repository: cstub/ml-ids
Branch: master
Commit: b0e0b117adf6
Files: 219
Total size: 6.0 MB
Directory structure:
gitextract_m34ow7k6/
├── .dockerignore
├── .gitattributes
├── .github/
│ └── workflows/
│ ├── build.yml
│ ├── deployment.yml
│ └── train.yml
├── .gitignore
├── .idea/
│ ├── .gitignore
│ ├── deployment.xml
│ ├── inspectionProfiles/
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ ├── ml-ids.iml
│ ├── modules.xml
│ └── vcs.xml
├── .pylintrc
├── Makefile
├── README.md
├── data/
│ ├── README.md
│ └── Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv
├── environment-notebook.yaml
├── environment.yaml
├── ml_ids/
│ ├── __init__.py
│ ├── conf.py
│ ├── data/
│ │ ├── __init__.py
│ │ ├── dataset.py
│ │ ├── metadata.py
│ │ └── split_dataset.py
│ ├── keras/
│ │ ├── __init__.py
│ │ ├── callbacks.py
│ │ ├── evaluation.py
│ │ ├── metrics.py
│ │ ├── model_selection.py
│ │ └── prediction.py
│ ├── libs/
│ │ └── dfencoder/
│ │ └── dataframe.py
│ ├── model_selection.py
│ ├── models/
│ │ ├── __init__.py
│ │ └── gradient_boost/
│ │ ├── __init__.py
│ │ ├── mlflow_wrapper.py
│ │ └── train.py
│ ├── prediction.py
│ ├── tf_utils.py
│ ├── transform/
│ │ ├── __init__.py
│ │ ├── preprocessing.py
│ │ └── sampling.py
│ └── visualization.py
├── models/
│ └── gradient_boost/
│ ├── envs/
│ │ ├── local/
│ │ │ └── train.py
│ │ └── sagemaker/
│ │ ├── configs/
│ │ │ ├── deploy.json
│ │ │ ├── train-cpu.json
│ │ │ └── train-gpu.json
│ │ ├── container/
│ │ │ ├── Dockerfile
│ │ │ └── train.py
│ │ └── scripts/
│ │ ├── build_image.sh
│ │ ├── deploy.py
│ │ ├── push_image_to_ecr.sh
│ │ ├── train.py
│ │ └── undeploy.py
│ ├── project/
│ │ ├── MLproject
│ │ ├── conda.yaml
│ │ └── train.py
│ ├── training_params.json
│ └── training_params_quick_run.json
├── notebooks/
│ ├── 01_data-cleanup/
│ │ └── data_cleanup.ipynb
│ ├── 02_exploratory-data-analysis/
│ │ └── exploratory_data_analysis.ipynb
│ ├── 03_ml-prototype/
│ │ ├── ml-prototype.ipynb
│ │ └── models/
│ │ └── gradient_boost_model.cbm
│ ├── 04_ml-prototype-spark/
│ │ ├── ml-prototype-spark.ipynb
│ │ └── models/
│ │ ├── gb-model/
│ │ │ ├── bestModel/
│ │ │ │ ├── data/
│ │ │ │ │ ├── ._SUCCESS.crc
│ │ │ │ │ ├── .part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── .part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc
│ │ │ │ │ ├── _SUCCESS
│ │ │ │ │ ├── part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ ├── part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ │ └── part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet
│ │ │ │ ├── metadata/
│ │ │ │ │ ├── ._SUCCESS.crc
│ │ │ │ │ ├── .part-00000.crc
│ │ │ │ │ ├── _SUCCESS
│ │ │ │ │ └── part-00000
│ │ │ │ └── treesMetadata/
│ │ │ │ ├── ._SUCCESS.crc
│ │ │ │ ├── .part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── .part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc
│ │ │ │ ├── _SUCCESS
│ │ │ │ ├── part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ ├── part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ │ └── part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
│ │ │ ├── estimator/
│ │ │ │ └── metadata/
│ │ │ │ ├── ._SUCCESS.crc
│ │ │ │ ├── .part-00000.crc
│ │ │ │ ├── _SUCCESS
│ │ │ │ └── part-00000
│ │ │ ├── evaluator/
│ │ │ │ └── metadata/
│ │ │ │ ├── ._SUCCESS.crc
│ │ │ │ ├── .part-00000.crc
│ │ │ │ ├── _SUCCESS
│ │ │ │ └── part-00000
│ │ │ └── metadata/
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ └── pipeline-model/
│ │ ├── metadata/
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ └── stages/
│ │ ├── 0_ValueCleaner_57f061a9e393/
│ │ │ └── metadata/
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ ├── 1_Imputer_3f8cf4b571a8/
│ │ │ ├── data/
│ │ │ │ ├── ._SUCCESS.crc
│ │ │ │ ├── .part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc
│ │ │ │ ├── _SUCCESS
│ │ │ │ └── part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet
│ │ │ └── metadata/
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ ├── 2_OneHotEncoderEstimator_f1dc6e50f52e/
│ │ │ ├── data/
│ │ │ │ ├── ._SUCCESS.crc
│ │ │ │ ├── .part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet.crc
│ │ │ │ ├── _SUCCESS
│ │ │ │ └── part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet
│ │ │ └── metadata/
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ ├── 3_VectorAssembler_ef6b7bf933ee/
│ │ │ └── metadata/
│ │ │ ├── ._SUCCESS.crc
│ │ │ ├── .part-00000.crc
│ │ │ ├── _SUCCESS
│ │ │ └── part-00000
│ │ └── 4_BinaryLabelMaker_3b174e5e0c29/
│ │ └── metadata/
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000.crc
│ │ ├── _SUCCESS
│ │ └── part-00000
│ ├── 05_anomaly_detection/
│ │ ├── dl-anomaly-detection.ipynb
│ │ ├── models/
│ │ │ ├── denoising_autoencoder_model.h5
│ │ │ ├── simple_autoencoder_model.h5
│ │ │ └── stacked_autoencoder_model.h5
│ │ └── notebook_utils.py
│ ├── 06_dl_classifier/
│ │ ├── dl-classifier.ipynb
│ │ ├── models/
│ │ │ ├── c0cb0656-558f-4311-b138-9b91ab4d1fe6.h5
│ │ │ ├── model_class_weight.h5
│ │ │ ├── model_no_class_weights.h5
│ │ │ └── opt_model.h5
│ │ └── notebook_utils.py
│ └── 07_binary_classifier_comparison/
│ ├── binary-classifier-comparison.ipynb
│ ├── models/
│ │ └── gb_835066e8-2427-48ca-a521-67195008cb91.catboost
│ └── notebook_utils.py
├── setup.cfg
├── setup.py
├── tests/
│ ├── data/
│ │ └── test_dataset.py
│ ├── transform/
│ │ └── test_preprocessing.py
│ └── validation_data/
│ └── validation.csv
└── upload.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
build/
data/
notebooks/
tests/
dataset/
================================================
FILE: .gitattributes
================================================
*.csv filter=lfs diff=lfs merge=lfs -text
tests/validation_data/*.csv -filter=lfs -diff=lfs -merge=lfs -text
*.catboost filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.cbm filter=lfs diff=lfs merge=lfs -text
================================================
FILE: .github/workflows/build.yml
================================================
name: Build
on: [push]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: Install dependencies
run: |
conda env create --file environment.yaml
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate ml-ids
pip install -e .
- name: Static Type Check
run: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate ml-ids
make typecheck
- name: Code Quality Check
run: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate ml-ids
make lint-errors
- name: Test with pytest
run: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate ml-ids
make test
================================================
FILE: .github/workflows/deployment.yml
================================================
name: Deploy Model on AWS Sagemaker
on:
deployment
jobs:
deploy:
name: Deploy
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Set Tag in Environment
id: set-aws-tag
run: |
if [ -z "$GITHUB_REF" ]
then
echo "No Tag given. Workflow may only be run on tagged commits."
exit 1
fi
echo "::set-output name=awstag::$(echo ${GITHUB_REF:10} | sed 's/[^a-zA-Z0-9]/-/g')"
- name: Set up Python 3.7
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install click==7.0
pip install boto3==1.10.28
pip install mlflow==1.4.0
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: eu-west-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v1
- name: Deploy model on AWS Sagemaker
id: deploy-model
env:
AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }}
run: |
make sagemaker_deploy JOB_ID="ml-ids-sagemaker-$AWS_TAG"
================================================
FILE: .github/workflows/train.yml
================================================
name: Train Model on AWS Sagemaker
on:
push:
tags:
- 'm*'
jobs:
train:
name: Deploy
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Set Tag in Environment
id: set-aws-tag
run: |
if [ -z "$GITHUB_REF" ]
then
echo "No Tag given. Workflow may only be run on tagged commits."
exit 1
fi
echo "::set-output name=awstag::$(echo ${GITHUB_REF:10} | sed 's/[^a-zA-Z0-9]/-/g')"
- name: Set up Python 3.7
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install click==7.0
pip install pandas==0.25.2
pip install sagemaker==1.44.3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: eu-west-1
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v1
- name: Build, tag, and push image to Amazon ECR
id: build-image
env:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: ml-ids-train-sagemaker
IMAGE_TAG: ${{ github.sha }}
AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }}
run: |
docker build -f models/gradient_boost/envs/sagemaker/container/Dockerfile -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:$AWS_TAG
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$AWS_TAG
echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
- name: Train the packaged model on AWS Sagemaker
id: train-model
env:
IMAGE_NAME: ${{ steps.build-image.outputs.image }}
AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }}
run: |
make sagemaker_train_aws \
SAGEMAKER_TRAIN_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/train-gpu.json \
SAGEMAKER_IMAGE_NAME=$IMAGE_NAME \
TRAIN_PARAM_PATH=models/gradient_boost/training_params.json \
JOB_ID="ml-ids-sagemaker-$AWS_TAG"
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
dataset/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Catboost
catboost_info
================================================
FILE: .idea/.gitignore
================================================
# Default ignored files
/workspace.xml
================================================
FILE: .idea/deployment.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" autoUpload="Always" serverName="glados@192.168.1.77:22">
<serverData>
<paths name="glados@192.168.1.77:22">
<serverdata>
<mappings>
<mapping deploy="/home/glados/Development/Projects/ml-ids-remote" local="$PROJECT_DIR$" />
</mappings>
</serverdata>
</paths>
</serverData>
<option name="myAutoUpload" value="ALWAYS" />
</component>
</project>
================================================
FILE: .idea/inspectionProfiles/profiles_settings.xml
================================================
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
================================================
FILE: .idea/misc.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.7.3 (sftp://glados@192.168.1.77:22/home/glados/anaconda3/envs/tf2/bin/python)" project-jdk-type="Python SDK" />
</project>
================================================
FILE: .idea/ml-ids.iml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/ml_ids" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Remote Python 3.7.3 (sftp://glados@192.168.1.77:22/home/glados/anaconda3/envs/tf2/bin/python)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PackageRequirementsSettings">
<option name="requirementsPath" value="" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="pytest" />
<option name="PROJECT_TEST_RUNNER" value="pytest" />
</component>
</module>
================================================
FILE: .idea/modules.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ml-ids.iml" filepath="$PROJECT_DIR$/.idea/ml-ids.iml" />
</modules>
</component>
</project>
================================================
FILE: .idea/vcs.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
================================================
FILE: .pylintrc
================================================
[MASTER]
# A comma-separated list of package or module names from which C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use.
jobs=1
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# Specify a configuration file.
#rcfile=
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=print-statement,
parameter-unpacking,
unpacking-in-except,
old-raise-syntax,
backtick,
long-suffix,
old-ne-operator,
old-octal-literal,
import-star-module-level,
non-ascii-bytes-literal,
raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
apply-builtin,
basestring-builtin,
buffer-builtin,
cmp-builtin,
coerce-builtin,
execfile-builtin,
file-builtin,
long-builtin,
raw_input-builtin,
reduce-builtin,
standarderror-builtin,
unicode-builtin,
xrange-builtin,
coerce-method,
delslice-method,
getslice-method,
setslice-method,
no-absolute-import,
old-division,
dict-iter-method,
dict-view-method,
next-method-called,
metaclass-assignment,
indexing-exception,
raising-string,
reload-builtin,
oct-method,
hex-method,
nonzero-method,
cmp-method,
input-builtin,
round-builtin,
intern-builtin,
unichr-builtin,
map-builtin-not-iterating,
zip-builtin-not-iterating,
range-builtin-not-iterating,
filter-builtin-not-iterating,
using-cmp-argument,
eq-without-hash,
div-method,
idiv-method,
rdiv-method,
exception-message-attribute,
invalid-str-codec,
sys-max-int,
bad-python3-import,
deprecated-string-function,
deprecated-str-translate-call,
deprecated-itertools-function,
deprecated-types-field,
next-method-defined,
dict-items-not-iterating,
dict-keys-not-iterating,
dict-values-not-iterating,
deprecated-operator-function,
deprecated-urllib-function,
xreadlines-attribute,
deprecated-sys-function,
exception-escape,
comprehension-escape,
R0903,
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by commas (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
# which contain the number of messages in each category, as well as 'statement'
# which is the total number of statements analyzed. This score is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
#msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
# Complete names of functions that never return. When checking for
# inconsistent-return-statements, if a never-returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=Blues
# Tells whether missing members accessed in a mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Show a hint with possible names when a member name was not found. Hints are
# found based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken into consideration
# when showing a hint for a missing member.
missing-member-max-choices=1
# List of decorators that change the signature of a decorated function.
signature-mutators=
[MISCELLANEOUS]
# List of note tags to take into consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
[LOGGING]
# Format style used to check logging format string. `old` means using %
# formatting, `new` is for `{}` formatting, and `fstr` is for f-strings.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[STRING]
# This flag controls whether the implicit-str-concat-in-sequence should
# generate a warning on implicit string concatenation in sequences defined over
# several lines.
check-str-concat-over-line-jumps=no
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=UPPER_CASE
# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
f,
ex,
df,
X,
X_train,
X_val,
X_test,
y,
Run,
_
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style.
#variable-rgx=
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=120
# Maximum number of lines in a module.
max-module-lines=1000
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,
dict-separator
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum lines number of a similarity.
min-similarity-lines=4
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
[DESIGN]
# Maximum number of arguments for function / method.
max-args=5
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branch for function / method body.
max-branches=12
# Maximum number of locals for function / method body.
max-locals=15
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=6
# Maximum number of statements in function / method body.
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled).
ext-import-graph=
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled).
import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
Exception
================================================
FILE: Makefile
================================================
# Default configuration. Every variable can be overridden on the command line,
# e.g. `make train_local TRAIN_PATH=other/train.h5`.
SAGEMAKER_TRAIN_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/train-gpu.json
SAGEMAKER_DEPLOY_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/deploy.json
TRAIN_PARAM_PATH=models/gradient_boost/training_params.json
TRAIN_PATH=dataset/train.h5
VAL_PATH=dataset/val.h5
TEST_PATH=dataset/test.h5

# None of the targets below produce a file named after the target. Declaring
# them phony keeps them runnable even if a file or directory with the same
# name (e.g. `test`) appears in the working tree.
.PHONY: clean test lint lint-errors typecheck split_dataset train_local \
        sagemaker_build_image sagemaker_push_image sagemaker_train_local \
        sagemaker_train_aws sagemaker_deploy sagemaker_undeploy

# Recreate an empty `build` directory. The leading `-` ignores a failing `rm`.
clean:
	-rm -r -f build
	mkdir build

# Run the test suite.
test:
	python -m pytest tests

# Run all pylint checks on the `ml_ids` package.
lint:
	pylint ml_ids

# Run pylint in errors-only mode.
lint-errors:
	pylint ml_ids -E

# Run static type checks on the `ml_ids` package.
typecheck:
	mypy ml_ids

# Split the source dataset into the `dataset` directory.
# DATASET_PATH is not defined in this Makefile and must be supplied by the caller.
split_dataset:
	mkdir -p dataset
	python ./ml_ids/data/split_dataset.py \
	    --dataset-path $(DATASET_PATH) \
	    --output-path dataset \
	    --random-seed 42

# Train the gradient boosting model locally; artifacts go to build/models/gradient_boost.
train_local:
	python ./models/gradient_boost/envs/local/train.py \
	    --train-path $(TRAIN_PATH) \
	    --val-path $(VAL_PATH) \
	    --test-path $(TEST_PATH) \
	    --output-path build/models/gradient_boost \
	    --param-path $(TRAIN_PARAM_PATH)

# Build the SageMaker training image. TAG must be supplied by the caller.
sagemaker_build_image:
	./models/gradient_boost/envs/sagemaker/scripts/build_image.sh ml-ids-train-sagemaker $(TAG)

# Push the training image and store the value of the `image-name=` output line
# in sagemaker-image-name.txt. TAG must be supplied by the caller.
sagemaker_push_image:
	./models/gradient_boost/envs/sagemaker/scripts/push_image_to_ecr.sh ml-ids-train-sagemaker $(TAG) | grep -Po '(?<=^image-name=).*' > sagemaker-image-name.txt

# Run the SageMaker training script in LOCAL mode. TAG must be supplied by the caller.
sagemaker_train_local:
	python ./models/gradient_boost/envs/sagemaker/scripts/train.py \
	    --config-path $(SAGEMAKER_TRAIN_CONFIG_PATH) \
	    --param-path $(TRAIN_PARAM_PATH) \
	    --mode LOCAL \
	    --image-name "ml-ids-train-sagemaker:$(TAG)" \
	    --job-id "ml-ids-sagemaker-job"

# Run the SageMaker training script in AWS mode.
# SAGEMAKER_IMAGE_NAME and JOB_ID must be supplied by the caller.
sagemaker_train_aws:
	python ./models/gradient_boost/envs/sagemaker/scripts/train.py \
	    --config-path $(SAGEMAKER_TRAIN_CONFIG_PATH) \
	    --param-path $(TRAIN_PARAM_PATH) \
	    --mode AWS \
	    --image-name $(SAGEMAKER_IMAGE_NAME) \
	    --job-id $(JOB_ID)

# Deploy the model of the given training job. JOB_ID must be supplied by the caller.
sagemaker_deploy:
	python ./models/gradient_boost/envs/sagemaker/scripts/deploy.py \
	    --config-path $(SAGEMAKER_DEPLOY_CONFIG_PATH) \
	    --job-id $(JOB_ID)

# Remove the deployment described by the deploy configuration.
sagemaker_undeploy:
	python ./models/gradient_boost/envs/sagemaker/scripts/undeploy.py \
	    --config-path $(SAGEMAKER_DEPLOY_CONFIG_PATH)
================================================
FILE: README.md
================================================
# A machine learning based approach towards building an Intrusion Detection System
## Problem Description
With the rising amount of network-enabled devices connected to the internet such as mobile phones, IoT appliances or vehicles the concern about the security implications of using these devices is growing. The increase in numbers and types of networked devices inevitably leads to a wider surface of attack whereas the impact of successful attacks is becoming increasingly severe as more critical responsibilities are assumed by these devices.
To identify and counter network attacks it is common to employ a combination of multiple systems in order to prevent attacks from happening or to detect and stop ongoing attacks if they can not be prevented initially.
These systems are usually comprised of an intrusion prevention system such as a firewall as the first layer of security with intrusion detection systems representing the second layer.
Should the intrusion prevention system be unable to prevent a network attack it is the task of the detection system to identify malicious network traffic in order to stop the ongoing attack and keep the recorded network traffic data for later analysis. This data can subsequently be used to update the prevention system to allow for the detection of the specific network attack in the future. The need for intrusion detection systems is rising as absolute prevention against attacks is not possible due to the rapid emergence of new attack types.
Even though intrusion detection systems are an essential part of network security many detection systems deployed today have a significant weakness as they rely on signature-based attack classification patterns which are able to detect the most common known attack patterns but have the drawback of being unable to detect novel attack types.
To overcome this limitation research in intrusion detection systems is focusing on more dynamic approaches based on machine learning and anomaly detection methods. In these systems the normal network behaviour is learned by processing previously recorded benign data packets which allows the system to identify new attack types by analyzing network traffic for anomalous data flows.
This project aims to implement a classifier capable of identifying network traffic as either benign or malicious based on machine learning and deep learning methodologies.
## Data
The data used to train the classifier is taken from the [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) dataset provided by the Canadian Institute for Cybersecurity. It was created by capturing all network traffic during ten days of operation inside a controlled network environment on AWS where realistic background traffic and different attack scenarios were conducted.
As a result the dataset contains both benign network traffic as well as captures of the most common network attacks.
The dataset is comprised of the raw network captures in pcap format as well as csv files created by using [CICFlowMeter-V3](https://www.unb.ca/cic/research/applications.html#CICFlowMeter) containing 80 statistical features of the individual network flows combined with their corresponding labels.
A network flow is defined as an aggregation of interrelated network packets identified by the following properties:
* Source IP
* Destination IP
* Source port
* Destination port
* Protocol
The dataset contains approximately 16 million individual network flows and covers the following attack scenarios:
* Brute Force
* DoS
* DDoS
* Heartbleed
* Web Attack
* Infiltration
* Botnet
## Approach
The goal of this project is to create a classifier capable of categorising network flows as either benign or malicious.
The problem is understood as a supervised learning problem using the labels provided in the dataset which identify the network flows as either benign or malicious. Different approaches of classifying the data will be evaluated to formulate the problem either as a binary classification or a multiclass classification problem differentiating between the individual classes of attacks provided in the dataset in the latter case. A relevant subset of the features provided in the dataset will be used as predictors to classify individual network flows.
Machine learning methods like k-nearest neighbours, random forest or SVM will be applied to the problem and evaluated in the first step in order to assess the feasibility of using traditional machine learning approaches.
Subsequently deep learning models like convolutional neural networks, autoencoders or recurrent neural networks will be employed to create a competing classifier as recent research has shown that deep learning methods represent a promising application in the field of anomaly detection.
The results of both approaches will be compared to select the best performing classifier.
## Deliverables
The classifier will be deployed and served via a REST API in conjunction with a simple web application providing a user interface to utilize the API.
The REST API will provide the following functionality:
* an endpoint to submit network capture files in pcap format. Individual network flows are extracted from the capture files and analysed for malicious network traffic.
* (optional) an endpoint to stream continuous network traffic captures which are analysed in near real-time combined with
* (optional) an endpoint to register a web-socket in order to get notified upon detection of malicious network traffic.
To further showcase the project, a testbed could be created against which various attack scenarios can be performed. This testbed would be connected to the streaming API for near real-time detection of malicious network traffic.
## Computational resources
The requirements regarding the computational resources to train the classifiers are given below:
| Category | Resource |
| ------------- | ------------- |
| CPU | Intel Core i7 processor |
| RAM | 32 GB |
| GPU | 1 GPU, 8 GB RAM |
| HDD | 100 GB |
## Classifier
The machine learning estimator created in this project follows a supervised approach and is trained using the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) algorithm. Employing the [CatBoost](https://catboost.ai/) library a binary classifier is created, capable of classifying network flows as either benign or malicious. The chosen parameters of the classifier and its performance metrics can be examined in the following [notebook](https://github.com/cstub/ml-ids/blob/master/notebooks/07_binary_classifier_comparison/binary-classifier-comparison.ipynb).
## Deployment Architecture
The deployment architecture of the complete ML-IDS system is explained in detail in the [system architecture](https://docs.google.com/document/d/1s_EBMTid4gdrsQU_xOCAYK1BzxkhhnYl6wHFSZo_9Tw/edit?usp=sharing).
## Model Training and Deployment
The model can be trained and deployed either locally or via [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
In each case the [MLflow](https://www.mlflow.org/docs/latest/index.html) framework is utilized to train the model and create the model artifacts.
### Installation
To install the necessary dependencies check out the project and create a new Anaconda environment from the `environment.yaml` file.
```
conda env create -f environment.yaml
```
Afterwards activate the environment and install the project resources.
```
conda activate ml-ids
pip install -e .
```
### Dataset Creation
To create the dataset for training use the following command:
```
make split_dataset \
DATASET_PATH={path-to-source-dataset}
```
This command will read the source dataset and split the dataset into separate train/validation/test sets with a sample ratio of 80%/10%/10%. The specified source dataset should be a folder containing multiple `.csv` files.
You can use the [CIC-IDS-2018 dataset](https://www.unb.ca/cic/datasets/ids-2018.html) provided via [Google Drive](https://drive.google.com/open?id=1HrTPh0YRSZ4T9DLa_c47lubheKUcPl0r) for this purpose.
Once the command completes, a new folder `dataset` is created that contains the split datasets in `.h5` format.
### Local Mode
To train the model in local mode, using the default parameters and dataset locations created by `split_dataset`, use the following command:
```
make train_local
```
If the datasets are stored in a different location or you want to specify different training parameters, you can optionally supply the dataset locations and a training parameter file:
```
make train_local \
TRAIN_PATH={path-to-train-dataset} \
VAL_PATH={path-to-val-dataset} \
TEST_PATH={path-to-test-dataset} \
TRAIN_PARAM_PATH={path-to-param-file}
```
Upon completion of the training process the model artifacts can be found in the `build/models/gradient_boost` directory.
To deploy the model locally the MLflow CLI can be used.
```
mlflow models serve -m build/models/gradient_boost -p 5000
```
The model can also be deployed as a Docker container using the following commands:
```
mlflow models build-docker -m build/models/gradient_boost -n ml-ids-classifier:1.0
docker run -p 5001:8080 ml-ids-classifier:1.0
```
### Amazon SageMaker
To train the model on Amazon SageMaker the following command sequence is used:
```
# build a new docker container for model training
make sagemaker_build_image \
TAG=1.0
# upload the container to AWS ECR
make sagemaker_push_image \
TAG=1.0
# execute the training container on Amazon SageMaker
make sagemaker_train_aws \
SAGEMAKER_IMAGE_NAME={ecr-image-name}:1.0 \
JOB_ID=ml-ids-job-0001
```
These commands require a valid AWS account with the appropriate permissions to be configured locally via the [AWS CLI](https://aws.amazon.com/cli/). Furthermore, [AWS ECR](https://aws.amazon.com/ecr/) and Amazon SageMaker must be configured for the account.
Using this repository, the manual invocation of the aforementioned commands is not necessary as training on Amazon SageMaker is supported via a [GitHub workflow](https://github.com/cstub/ml-ids/blob/master/.github/workflows/train.yml) that is triggered upon creation of a new tag of the form `m*` (e.g. `m1.0`).
To deploy a trained model on Amazon SageMaker a [GitHub Deployment request](https://developer.github.com/v3/repos/deployments/) using the GitHub API must be issued, specifying the tag of the model.
```
{
"ref": "refs/tags/m1.0",
"payload": {},
"description": "Deploy request for model version m1.0",
"auto_merge": false
}
```
This deployment request triggers a [GitHub workflow](https://github.com/cstub/ml-ids/blob/master/.github/workflows/deployment.yml), deploying the model to SageMaker.
After successful deployment the model is accessible via the SageMaker HTTP API.
## Using the Classifier
The classifier deployed on Amazon SageMaker is not directly available publicly, but can be accessed using the [ML-IDS REST API](https://github.com/cstub/ml-ids-api).
### REST API
To invoke the REST API the following command can be used to submit a prediction request for a given network flow:
```
curl -X POST \
http://ml-ids-cluster-lb-1096011980.eu-west-1.elb.amazonaws.com/api/predictions \
-H 'Accept: */*' \
-H 'Content-Type: application/json; format=pandas-split' \
-H 'Host: ml-ids-cluster-lb-1096011980.eu-west-1.elb.amazonaws.com' \
-H 'cache-control: no-cache' \
-d '{"columns":["dst_port","protocol","timestamp","flow_duration","tot_fwd_pkts","tot_bwd_pkts","totlen_fwd_pkts","totlen_bwd_pkts","fwd_pkt_len_max","fwd_pkt_len_min","fwd_pkt_len_mean","fwd_pkt_len_std","bwd_pkt_len_max","bwd_pkt_len_min","bwd_pkt_len_mean","bwd_pkt_len_std","flow_byts_s","flow_pkts_s","flow_iat_mean","flow_iat_std","flow_iat_max","flow_iat_min","fwd_iat_tot","fwd_iat_mean","fwd_iat_std","fwd_iat_max","fwd_iat_min","bwd_iat_tot","bwd_iat_mean","bwd_iat_std","bwd_iat_max","bwd_iat_min","fwd_psh_flags","bwd_psh_flags","fwd_urg_flags","bwd_urg_flags","fwd_header_len","bwd_header_len","fwd_pkts_s","bwd_pkts_s","pkt_len_min","pkt_len_max","pkt_len_mean","pkt_len_std","pkt_len_var","fin_flag_cnt","syn_flag_cnt","rst_flag_cnt","psh_flag_cnt","ack_flag_cnt","urg_flag_cnt","cwe_flag_count","ece_flag_cnt","down_up_ratio","pkt_size_avg","fwd_seg_size_avg","bwd_seg_size_avg","fwd_byts_b_avg","fwd_pkts_b_avg","fwd_blk_rate_avg","bwd_byts_b_avg","bwd_pkts_b_avg","bwd_blk_rate_avg","subflow_fwd_pkts","subflow_fwd_byts","subflow_bwd_pkts","subflow_bwd_byts","init_fwd_win_byts","init_bwd_win_byts","fwd_act_data_pkts","fwd_seg_size_min","active_mean","active_std","active_max","active_min","idle_mean","idle_std","idle_max","idle_min"],"data":[[80,17,"21\\/02\\/2018 10:15:06",119759145,75837,0,2426784,0,32,32,32.0,0.0,0,0,0.0,0.0,20263.87212,633.2460039,1579.1859130859,31767.046875,920247,1,120000000,1579.1859130859,31767.046875,920247,1,0,0.0,0.0,0,0,0,0,0,0,606696,0,633.2460327148,0.0,32,32,32.0,0.0,0.0,0,0,0,0,0,0,0,0,0,32.0004234314,32.0,0.0,0,0,0,0,0,0,75837,2426784,0,0,-1,-1,75836,8,0.0,0.0,0,0,0.0,0.0,0,0]]}'
```
### ML-IDS API Clients
For convenience, the Python clients implemented in the [ML-IDS API Clients project](https://github.com/cstub/ml-ids-api-client) can be used to submit new prediction requests to the API and receive real-time notifications on detection of malicious network flows.
================================================
FILE: data/README.md
================================================
## Data
The data used to train the classifiers is taken from the [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) dataset provided by the Canadian Institute for Cybersecurity.
It was created by capturing all network traffic during ten days of operation inside a controlled network environment on AWS where realistic background traffic and different attack scenarios were conducted.
The dataset consists of raw network captures in pcap format as well as processed csv files created by using [CICFlowMeter-V3](https://www.unb.ca/cic/research/applications.html#CICFlowMeter) containing 80 statistical features of the individual network flows combined with their corresponding labels.
Due to size limitations the data provided in this repository represents only a small portion of the dataset in form of processed network flows. The full dataset consisting of the raw network captures and the processed csv files can be retrieved from AWS S3.
## Download
A prerequisite to downloading the full dataset is the installation of the [AWS CLI](https://aws.amazon.com/cli/).
To download the processed csv files containing the analyzed network flows (~7GB) run the following command:
```bash
aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/Processed Traffic Data for ML Algorithms/" <dest-dir>
```
To download the raw network captures in pcap format (~477GB) run:
```bash
aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/Original Network Traffic and Log data/" <dest-dir>
```
To download the full dataset containing the raw network captures and processed csv files (~484GB) use the following command:
```bash
aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/" <dest-dir>
```
## Preprocessed Dataset
The preprocessed dataset used for model training and evaluation can be found at [Google Drive](https://drive.google.com/drive/folders/1AWhRsVShJ_KvYKrV0VlnM1odtJ4Tp-uC?usp=sharing).
================================================
FILE: data/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv
================================================
version https://git-lfs.github.com/spec/v1
oid sha256:acff8bc61376ee031d80878ee6099e0b1a87a1bd711d8068298421418c9f8147
size 358223333
================================================
FILE: environment-notebook.yaml
================================================
name: ml-ids-notebooks
channels:
- anaconda
- conda-forge
- defaults
dependencies:
- catboost=0.18.1=py37_0
- click=7.0=py37_0
- cloudpickle=1.2.2=py_0
- eli5=0.10.1=py37_1
- findspark=1.3.0=py_1
- imbalanced-learn=0.5.0=py_0
- jupyter=1.0.0=py_2
- matplotlib=3.1.1=py37_1
- numpy=1.17.2=py37h95a1406_0
- pandas=0.25.2=py37hb3f55d8_0
- pip=19.2.3=py37_0
- pyspark=2.4.4=py_0
- pytest=5.2.1=py37_0
- pytest-runner=5.1=py_0
- python=3.7.3=h33d41f4_1
- python-dateutil<2.8.1
- requests<2.21.0
- scikit-learn=0.21.3=py37hcdab131_0
- scikit-plot=0.3.7=py_1
- scipy=1.3.1=py37h921218d_2
- seaborn=0.9.0=py_1
- setuptools=41.6.0=py37_1
- shap=0.31.0=py37hb3f55d8_0
- pip:
- h5py==2.10.0
- hyperopt==0.2.2
- keras==2.3.1
- keras-applications==1.0.8
- keras-preprocessing==1.1.0
- tables==3.6.1
- tensorboard==2.0.0
- tensorflow-estimator==2.0.0
- tensorflow-gpu==2.0.0
================================================
FILE: environment.yaml
================================================
name: ml-ids
channels:
- anaconda
- conda-forge
- defaults
dependencies:
- catboost=0.18.1=py37_0
- click=7.0=py37_0
- cloudpickle=1.2.2=py_0
- imbalanced-learn=0.5.0=py_0
- matplotlib=3.1.1=py37_1
- mypy=0.750
- numpy=1.17.2=py37h95a1406_0
- pandas=0.25.2=py37hb3f55d8_0
- pip=19.2.3=py37_0
- pylint=2.4.4
- pytest=5.2.1=py37_0
- pytest-runner=5.1=py_0
- python=3.7.3=h33d41f4_1
- python-dateutil<2.8.1
- requests<2.21.0
- scikit-learn=0.21.3=py37hcdab131_0
- scipy=1.3.1=py37h921218d_2
- seaborn=0.9.0=py_1
- setuptools=41.6.0=py37_1
- pip:
- mlflow==1.4
- sagemaker==1.44.3
- h5py==2.10.0
- hyperopt==0.2.2
- keras==2.3.1
- keras-applications==1.0.8
- keras-preprocessing==1.1.0
- tables==3.6.1
- tensorflow-estimator==2.0.0
- tensorflow-gpu==2.0.0
================================================
FILE: ml_ids/__init__.py
================================================
================================================
FILE: ml_ids/conf.py
================================================
"""
Global configuration variables.
"""
import os
# Absolute path of the directory that contains the package directory of this module.
# Nested `dirname` calls are used instead of splitting on `os.sep`, because the split/join
# approach yields an empty (i.e. relative) path if the package is located directly below the
# filesystem root.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Directory `tests` below the root directory.
TEST_DIR = os.path.join(ROOT_DIR, 'tests')

# Directory `validation_data` below the test directory.
TEST_DATA_DIR = os.path.join(TEST_DIR, 'validation_data')
================================================
FILE: ml_ids/data/__init__.py
================================================
================================================
FILE: ml_ids/data/dataset.py
================================================
"""
Utilities to manipulate the CIC-IDS-2018 dataset.
"""
from typing import List
import os
import glob
import numpy as np
import pandas as pd
import ml_ids.data.metadata as md
def remove_inf_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replaces values of type `np.inf` and `-np.inf` in a DataFrame with `null` values.

    The given DataFrame is modified and returned.

    :param df: Input DataFrame.
    :return: The DataFrame without `np.inf` and `-np.inf` values.
    """
    inf_values = [np.inf, -np.inf]

    for col in df.columns:
        # Check for both signs: a column that only contains `-np.inf` must be cleaned as well.
        if df[col].isin(inf_values).any():
            # Re-assign the column instead of calling `replace(..., inplace=True)` on `df[col]`,
            # which may operate on a temporary copy and leave the DataFrame unchanged.
            df[col] = df[col].replace(inf_values, np.nan)

    return df
def remove_negative_values(df: pd.DataFrame, ignore_cols: List[str] = None) -> pd.DataFrame:
    """
    Replaces negative values in the numeric columns of a DataFrame with `null` values.

    The given DataFrame is modified and returned.

    :param df: Input DataFrame.
    :param ignore_cols: Columns to ignore. Negative values in these columns will be preserved.
                        Names that do not refer to a numeric column of the DataFrame are skipped.
    :return: The DataFrame without negative values.
    """
    if ignore_cols is None:
        ignore_cols = []

    # `errors='ignore'` tolerates ignored columns that are absent (e.g. if only a subset of the
    # columns was loaded) or non-numeric, instead of failing with a `KeyError`.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(ignore_cols, errors='ignore')

    for col in numeric_cols:
        negative = df[col] < 0
        if negative.any():
            # `mask` returns a column that can hold `null` values (integer columns are upcast to
            # float), which avoids assigning NaN into an integer column in place.
            df[col] = df[col].mask(negative)

    return df
def add_label_category_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derives the label category of every row and stores it in the column `label_cat`.

    The category is looked up in `md.LABEL_CAT_MAPPING` using the value of the `label` column.
    The given DataFrame is modified and returned.

    :param df: Input DataFrame containing the column `label`.
    :return: The DataFrame containing a new column `label_cat`.
    """
    def to_category(label):
        # A label missing in the mapping raises a `KeyError`.
        return md.LABEL_CAT_MAPPING[label]

    categories = df.label.apply(to_category)
    df[md.COLUMN_LABEL_CAT] = categories
    return df
def add_label_is_attack_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Stores a binary attack indicator for every row in the column `label_is_attack`.

    Rows whose `label` equals `md.LABEL_BENIGN` are marked with `0` (benign), all other rows are
    marked with `1` (attack). The given DataFrame is modified and returned.

    :param df: Input DataFrame containing the column `label`.
    :return: The DataFrame containing a new column `label_is_attack`.
    """
    def to_indicator(label):
        if label == md.LABEL_BENIGN:
            return 0
        return 1

    indicators = df.label.apply(to_indicator)
    df[md.COLUMN_LABEL_IS_ATTACK] = indicators
    return df
def load_dataset_generic(load_df_fn,
                         dataset_path: str,
                         use_cols: List[str] = None,
                         omit_cols: List[str] = None,
                         preserve_neg_value_cols: list = None,
                         transform_data: bool = True) -> pd.DataFrame:
    """
    Loads the dataset from the given path using the supplied function.

    If `transform_data` is set, all invalid values (`np.inf`, `-np.inf`, negative) are replaced with `null`
    for easy imputation. Negative values of columns specified in `preserve_neg_value_cols` will be preserved.
    If the loaded data contains the label column, the derived columns `label_cat` and `label_is_attack`
    are added.

    :param load_df_fn: Function used to load the dataset. Called as `load_df_fn(dataset_path, cols)`, where
                       `cols` is the list of columns to load or `None` if no column selection was requested.
    :param dataset_path: Path of the dataset, passed on to `load_df_fn`.
    :param use_cols: Columns to load. If empty or `None`, no restriction is applied.
    :param omit_cols: Columns to omit. Applied to `use_cols` if given, otherwise to all columns
                      defined in `md.COLUMN_DTYPES`.
    :param preserve_neg_value_cols: Columns in which negative values are preserved.
    :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values).
    :return: The dataset as a DataFrame.
    """
    cols = use_cols if use_cols else None

    if omit_cols:
        # Honour both selections: start from the requested columns if given instead of silently
        # discarding `use_cols`, otherwise start from all known columns.
        candidates = cols if cols is not None else list(md.COLUMN_DTYPES)
        cols = [c for c in candidates if c not in omit_cols]

    df = load_df_fn(dataset_path, cols)

    if transform_data:
        df = remove_inf_values(df)
        df = remove_negative_values(df, preserve_neg_value_cols)

    # The derived label columns can only be computed if the label column was loaded.
    if md.COLUMN_LABEL in df.columns:
        df = add_label_category_column(df)
        df = add_label_is_attack_columns(df)

    return df
def load_dataset(dataset_path: str,
                 use_cols: List[str] = None,
                 omit_cols: List[str] = None,
                 nrows: int = None,
                 transform_data: bool = True,
                 preserve_neg_value_cols: list = None) -> pd.DataFrame:
    """
    Loads the dataset in CSV format from the given path.

    All files matching `*.csv` in the given directory are read in sorted order and concatenated.
    If `transform_data` is set, all invalid values (`np.inf`, `-np.inf`, negative) are replaced with `null`
    for easy imputation. Negative values of columns specified in `preserve_neg_value_cols` will be preserved.

    :param dataset_path: Path of the base directory containing all files of the dataset.
    :param use_cols: Columns to load.
    :param omit_cols: Columns to omit.
    :param nrows: Number of rows to load per file.
    :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values).
    :param preserve_neg_value_cols: Columns in which negative values are preserved.
    :return: The dataset as a DataFrame.
    :raises ValueError: If the directory does not contain any `.csv` file.
    """

    def load_csv(path, cols):
        # `glob` returns files in arbitrary, filesystem-dependent order. Sorting makes the row order
        # of the resulting DataFrame reproducible across machines.
        files = sorted(glob.glob(os.path.join(path, '*.csv')))

        if not files:
            # `pd.concat` would fail with the less helpful `ValueError: No objects to concatenate`.
            raise ValueError('No .csv files found in directory "{}".'.format(path))

        frames = [pd.read_csv(f, dtype=md.COLUMN_DTYPES, usecols=cols, nrows=nrows) for f in files]
        return pd.concat(frames)

    return load_dataset_generic(load_df_fn=load_csv,
                                dataset_path=dataset_path,
                                use_cols=use_cols,
                                omit_cols=omit_cols,
                                preserve_neg_value_cols=preserve_neg_value_cols,
                                transform_data=transform_data)
def load_dataset_hdf(dataset_path: str,
                     use_cols: List[str] = None,
                     omit_cols: List[str] = None,
                     preserve_neg_value_cols: list = None,
                     transform_data: bool = True,
                     key: str = None) -> pd.DataFrame:
    """
    Reads the dataset from a HDF store and hands it to `load_dataset_generic` for post-processing.

    If `transform_data` is set, invalid values (`np.inf`, `-np.inf`, negative) are replaced with `null`
    for easy imputation, except for negative values in the columns listed in `preserve_neg_value_cols`.

    :param dataset_path: Path of the HDF store, passed to `pd.read_hdf`.
    :param use_cols: Columns to load.
    :param omit_cols: Columns to omit.
    :param preserve_neg_value_cols: Columns in which negative values are preserved.
    :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values).
    :param key: Group identifier in the HDF store.
    :return: The dataset as a DataFrame.
    """
    return load_dataset_generic(
        # The reader closes over `key`; the column selection is supplied by `load_dataset_generic`.
        load_df_fn=lambda path, cols: pd.read_hdf(path, key=key, columns=cols),
        dataset_path=dataset_path,
        use_cols=use_cols,
        omit_cols=omit_cols,
        preserve_neg_value_cols=preserve_neg_value_cols,
        transform_data=transform_data,
    )
================================================
FILE: ml_ids/data/metadata.py
================================================
"""
Metadata of the CIC-IDS-2018 dataset.
"""
# Maps each dataset column name to the dtype string used when the raw data is read
# (the CSV loader passes a mapping of this name as the `dtype` argument of `pd.read_csv`).
# Signed integer types are used for some duration/IAT/window columns.
# NOTE(review): presumably because the raw data contains negative values there - confirm against the dataset.
COLUMN_DTYPES = {
    'dst_port': 'uint32',
    'protocol': 'uint8',
    'timestamp': 'object',
    'flow_duration': 'int64',
    'tot_fwd_pkts': 'uint32',
    'tot_bwd_pkts': 'uint32',
    'totlen_fwd_pkts': 'uint32',
    'totlen_bwd_pkts': 'uint32',
    'fwd_pkt_len_max': 'uint16',
    'fwd_pkt_len_min': 'uint16',
    'fwd_pkt_len_mean': 'float32',
    'fwd_pkt_len_std': 'float32',
    'bwd_pkt_len_max': 'uint16',
    'bwd_pkt_len_min': 'uint16',
    'bwd_pkt_len_mean': 'float32',
    'bwd_pkt_len_std': 'float32',
    'flow_byts_s': 'float64',
    'flow_pkts_s': 'float64',
    'flow_iat_mean': 'float32',
    'flow_iat_std': 'float32',
    'flow_iat_max': 'int64',
    'flow_iat_min': 'int64',
    'fwd_iat_tot': 'int64',
    'fwd_iat_mean': 'float32',
    'fwd_iat_std': 'float32',
    'fwd_iat_max': 'int64',
    'fwd_iat_min': 'int64',
    'bwd_iat_tot': 'uint32',
    'bwd_iat_mean': 'float32',
    'bwd_iat_std': 'float32',
    'bwd_iat_max': 'uint32',
    'bwd_iat_min': 'uint32',
    'fwd_psh_flags': 'uint8',
    'bwd_psh_flags': 'uint8',
    'fwd_urg_flags': 'uint8',
    'bwd_urg_flags': 'uint8',
    'fwd_header_len': 'uint32',
    'bwd_header_len': 'uint32',
    'fwd_pkts_s': 'float32',
    'bwd_pkts_s': 'float32',
    'pkt_len_min': 'uint16',
    'pkt_len_max': 'uint16',
    'pkt_len_mean': 'float32',
    'pkt_len_std': 'float32',
    'pkt_len_var': 'float32',
    'fin_flag_cnt': 'uint8',
    'syn_flag_cnt': 'uint8',
    'rst_flag_cnt': 'uint8',
    'psh_flag_cnt': 'uint8',
    'ack_flag_cnt': 'uint8',
    'urg_flag_cnt': 'uint8',
    'cwe_flag_count': 'uint8',
    'ece_flag_cnt': 'uint8',
    'down_up_ratio': 'uint16',
    'pkt_size_avg': 'float32',
    'fwd_seg_size_avg': 'float32',
    'bwd_seg_size_avg': 'float32',
    'fwd_byts_b_avg': 'uint8',
    'fwd_pkts_b_avg': 'uint8',
    'fwd_blk_rate_avg': 'uint8',
    'bwd_byts_b_avg': 'uint8',
    'bwd_pkts_b_avg': 'uint8',
    'bwd_blk_rate_avg': 'uint8',
    'subflow_fwd_pkts': 'uint32',
    'subflow_fwd_byts': 'uint32',
    'subflow_bwd_pkts': 'uint32',
    'subflow_bwd_byts': 'uint32',
    'init_fwd_win_byts': 'int32',
    'init_bwd_win_byts': 'int32',
    'fwd_act_data_pkts': 'uint32',
    'fwd_seg_size_min': 'uint8',
    'active_mean': 'float32',
    'active_std': 'float32',
    'active_max': 'uint32',
    'active_min': 'uint32',
    'idle_mean': 'float32',
    'idle_std': 'float32',
    'idle_max': 'uint64',
    'idle_min': 'uint64',
    # The target label is read as a pandas categorical.
    'label': 'category'
}
# Label value of the non-attack class.
LABEL_BENIGN = 'Benign'

# Numeric category id per label. The id is the position of the label in the tuple below,
# so entries must only ever be appended - reordering would silently renumber the categories.
LABEL_CAT_MAPPING = dict(zip(
    (
        'Benign',
        'Bot',
        'Brute Force -Web',
        'Brute Force -XSS',
        'DoS attacks-GoldenEye',
        'DoS attacks-Hulk',
        'DoS attacks-SlowHTTPTest',
        'DoS attacks-Slowloris',
        'DDOS attack-HOIC',
        'DDOS attack-LOIC-UDP',
        'DDoS attacks-LOIC-HTTP',
        'FTP-BruteForce',
        'Infilteration',
        'SQL Injection',
        'SSH-Bruteforce',
        'DDOS LOIT',
        'Heartbleed',
        'PortScan',
    ),
    range(18),
))

# Feature columns listed as carrying no variance.
FEATURES_NO_VARIANCE = [
    'bwd_blk_rate_avg',
    'bwd_byts_b_avg',
    'bwd_pkts_b_avg',
    'bwd_psh_flags',
    'bwd_urg_flags',
    'fwd_blk_rate_avg',
    'fwd_byts_b_avg',
    'fwd_pkts_b_avg',
]

# Columns listed as features to ignore.
FEATURES_TO_IGNORE = ['timestamp', 'dst_port', 'protocol']

# Columns whose negative values are to be preserved.
FEATURES_PRESERVE_NEG_COLUMNS = ['init_fwd_win_byts', 'init_bwd_win_byts']

# Names of the label columns: the raw label, its numeric category and the binary attack flag.
COLUMN_LABEL = 'label'
COLUMN_LABEL_CAT = 'label_cat'
COLUMN_LABEL_IS_ATTACK = 'label_is_attack'
================================================
FILE: ml_ids/data/split_dataset.py
================================================
"""
CLI to split a single dataset into train/val/test sub-datasets.
"""
import os
import sys
import logging
import click
import pandas as pd
import ml_ids.data.metadata as md
from ml_ids.data.dataset import load_dataset
from ml_ids.model_selection import train_val_test_split
# Configure the root logger at import time: records are written to stdout at INFO level
# and prefixed with the time of day, the emitting module and the log level.
logging.basicConfig(
    format='[%(asctime)s|%(module)s.py|%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
    stream=sys.stdout
)
@click.command()
@click.option('--dataset-path', type=click.Path(exists=True), required=True,
              help='Path to the input dataset in .csv format. Can be a folder containing multiple files.')
@click.option('--output-path', type=click.Path(exists=True), required=True,
              help='Path to store the output datasets.')
@click.option('--val-size', type=click.FloatRange(0, 1), default=0.1,
              help='Fraction of the data used for the validation set.')
@click.option('--test-size', type=click.FloatRange(0, 1), default=0.1,
              help='Fraction of the data used for the test set.')
@click.option('--nrows', type=int,
              help='Number of rows to load per input file.')
@click.option('--random-seed', type=int,
              help='Random seed.')
def split_dataset(dataset_path, output_path, val_size, test_size, nrows, random_seed):
    """
    CLI entry point: loads the dataset, splits it into train/val/test partitions stratified by the label category
    and stores every partition as `<name>.h5` in the output directory.

    The derived label columns are removed from all partitions before the first file is written.
    """
    logging.info('Loading dataset from "%s"...', dataset_path)
    full_dataset = load_dataset(dataset_path=dataset_path, transform_data=False, nrows=nrows)

    train, val, test = train_val_test_split(full_dataset,
                                            val_size=val_size,
                                            test_size=test_size,
                                            stratify_col=md.COLUMN_LABEL_CAT,
                                            random_state=random_seed)

    # Clean all partitions first, then persist them in train/val/test order.
    cleaned_partitions = [remove_extra_labels(partition) for partition in (train, val, test)]
    for partition, ds_type in zip(cleaned_partitions, ('train', 'val', 'test')):
        save_dataset(partition, output_path, ds_type)

    logging.info('Processing complete.')
def remove_extra_labels(dataset: pd.DataFrame):
    """
    Drops the derived target columns (label category and binary attack flag) from the dataset.

    :param dataset: Input dataset as Pandas DataFrame.
    :return: New DataFrame without the derived target columns.
    """
    derived_label_cols = [md.COLUMN_LABEL_CAT, md.COLUMN_LABEL_IS_ATTACK]
    return dataset.drop(columns=derived_label_cols)
def save_dataset(dataset: pd.DataFrame, path: str, ds_type: str):
    """
    Stores the given dataset in hdf format on the specified path.

    The file is named `<ds_type>.h5` and the data is written in table format under the key `ids_data`,
    compressed with zlib at level 5.

    :param dataset: Dataset as Pandas DataFrame.
    :param path: Target directory to store the dataset in.
    :param ds_type: Dataset type, used as the file name (without extension).
    :return: None
    """
    file_path = os.path.join(path, '{}.h5'.format(ds_type))

    logging.info('Storing dataset "%s" of size %d to "%s"', ds_type, len(dataset), file_path)

    # `key` is passed by keyword: positional use is deprecated since pandas 2.1 and rejected by later releases.
    dataset.to_hdf(file_path, key='ids_data', format='t', complevel=5, complib='zlib')
# Script entry point. `split_dataset` is a click command, so its arguments are filled in from the command line.
if __name__ == '__main__':
    # pylint: disable=no-value-for-parameter
    split_dataset()
================================================
FILE: ml_ids/keras/__init__.py
================================================
================================================
FILE: ml_ids/keras/callbacks.py
================================================
"""
Custom callbacks for Keras models.
"""
# pylint: disable=import-error
from tensorflow import keras
from tensorflow.keras import callbacks
K = keras.backend
class OneCycleScheduler(callbacks.Callback):
    """
    Keras callback implementing a one-cycle learning-rate scheduler.
    Provided by https://github.com/ageron/handson-ml2/blob/master/11_training_deep_neural_networks.ipynb.

    The learning rate is updated before every batch in three linear phases:
      1. warm-up from `start_rate` to `max_rate` during the first `half_iteration` batches,
      2. cool-down from `max_rate` back to `start_rate` during the next `half_iteration` batches,
      3. annealing from `start_rate` to `last_rate` during the remaining batches.
    """

    def __init__(self, iterations, max_rate, start_rate=None,
                 last_iterations=None, last_rate=None):
        """
        :param iterations: Total number of batches the schedule spans.
        :param max_rate: Peak learning rate reached at the end of the warm-up phase.
        :param start_rate: Initial learning rate. Defaults to `max_rate / 10`.
        :param last_iterations: Number of batches of the final annealing phase. Defaults to `iterations // 10 + 1`.
        :param last_rate: Learning rate at the end of the schedule. Defaults to `start_rate / 1000`.
        """
        # Initialise the Keras base class so that the attributes it manages exist.
        super().__init__()
        self.iterations = iterations
        self.max_rate = max_rate
        self.start_rate = start_rate or max_rate / 10
        self.last_iterations = last_iterations or iterations // 10 + 1
        self.half_iteration = (iterations - self.last_iterations) // 2
        self.last_rate = last_rate or self.start_rate / 1000
        self.iteration = 0

    def _interpolate(self, iter1, iter2, rate1, rate2):
        """
        Linearly interpolates the rate for the current iteration between the points
        (`iter1`, `rate1`) and (`iter2`, `rate2`).
        """
        # Progress is measured from the START of the segment. The previous `(iter2 - self.iteration)`
        # ran every phase backwards, i.e. it began at `rate2` and ended at `rate1`.
        return ((rate2 - rate1) * (self.iteration - iter1)
                / (iter2 - iter1) + rate1)

    def on_batch_begin(self, batch, logs=None):
        """
        Sets the optimizer's learning rate for the upcoming batch and advances the internal batch counter.
        """
        if self.iteration < self.half_iteration:
            rate = self._interpolate(0, self.half_iteration, self.start_rate, self.max_rate)
        elif self.iteration < 2 * self.half_iteration:
            rate = self._interpolate(self.half_iteration, 2 * self.half_iteration,
                                     self.max_rate, self.start_rate)
        else:
            rate = self._interpolate(2 * self.half_iteration, self.iterations,
                                     self.start_rate, self.last_rate)
            # Never drop below the final rate if training runs longer than `iterations` batches.
            rate = max(rate, self.last_rate)
        self.iteration += 1
        K.set_value(self.model.optimizer.lr, rate)
================================================
FILE: ml_ids/keras/evaluation.py
================================================
"""
Utility functions to evaluate Keras models.
"""
# Batch size passed to `model.evaluate`.
PREDICT_BATCH_SIZE = 16384


def evaluate_model(model, X_train, y_train, X_val, y_val, metric_title):
    """
    Prints the result of `model.evaluate` for the training and the validation dataset below a small header.

    :param model: Keras model.
    :param X_train: Predictor variables of the training dataset.
    :param y_train: Target labels of the training dataset.
    :param X_val: Predictor variables of the validation dataset.
    :param y_val: Target labels of the validation dataset.
    :param metric_title: Title line printed above the metric values.
    :return: None
    """
    for header_line in ('Evaluation:', '===========', ' {}'.format(metric_title)):
        print(header_line)

    # The training set is evaluated and printed before the validation set is touched.
    report_rows = (
        ('Train: {}', X_train, y_train),
        ('Val: {}', X_val, y_val),
    )
    for template, features, targets in report_rows:
        scores = model.evaluate(features, targets, batch_size=PREDICT_BATCH_SIZE, verbose=0)
        print(template.format(scores))
================================================
FILE: ml_ids/keras/metrics.py
================================================
"""
Utilities to create custom metrics for Keras models.
"""
# pylint: disable=import-error
import gc
import numpy as np
from tensorflow import keras
from tensorflow.keras import callbacks
from sklearn.metrics import average_precision_score
K = keras.backend
class AveragePrecisionScoreMetric(callbacks.Callback):
    """
    Keras callback reporting the average precision score on a validation dataset after every epoch.

    The score is computed with `average_precision_score` from Scikit-learn, using the per-sample mean squared
    difference between the validation inputs and the model output as the ranking score.
    """

    def __init__(self, X_val, y_val, batch_size=4096):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val
        self.batch_size = batch_size

    def get_precision_score(self):
        """
        Calculates the average precision score using scikit-learn.
        """
        reconstruction = self.model.predict(self.X_val, batch_size=self.batch_size)

        # reduces memory consumption caused by a memory leak in `model.predict()` of Tensorflow 2
        # https://github.com/tensorflow/tensorflow/issues/33009
        gc.collect()

        squared_error = np.power(self.X_val - reconstruction, 2)
        per_sample_mse = np.mean(squared_error, axis=1)
        return average_precision_score(self.y_val, per_sample_mse)

    def on_epoch_end(self, epoch, logs):
        """
        Invoked after each training epoch: stores the score in `logs` under `val_auprc` and prints it.
        """
        score = self.get_precision_score()
        logs['val_auprc'] = score
        print(' - val_auprc: {0:.4f}'.format(score))
================================================
FILE: ml_ids/keras/model_selection.py
================================================
"""
Utility functions for model selection of Keras models.
"""
import gc
from typing import Tuple
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tensorflow import keras
def cross_val_train(fit_fn,
                    X: np.ndarray,
                    y: np.ndarray,
                    target_transform_fn=None,
                    target_stratify_fn=None,
                    n_splits: int = 3,
                    fit_args: dict = None,
                    random_state: int = None) -> Tuple[np.ndarray, np.ndarray, list]:
    """
    Performs stratified cross-validation for a Keras model using the provided fit function.

    :param fit_fn: The function used to fit a model with a given split of the train and test set. It is called as
        `fit_fn(X_train, y_train, X_val, y_val, fit_args, is_first_fold)` and must return a tuple of the fitted
        Keras model and its history (a single history object or a list of them).
    :param X: Predictor variables.
    :param y: Labels.
    :param target_transform_fn: Function to transform the target labels (e.g. one-hot encoding).
        If omitted, the labels are passed on unchanged.
    :param target_stratify_fn: Function to extract the target label to stratify by.
        If omitted, `y` itself is used for stratification.
    :param n_splits: Number of cross-validation splits.
    :param fit_args: Arguments to pass to the fit function.
    :param random_state: Random state. When given, the samples are shuffled (reproducibly) before splitting;
        when omitted the folds are built from the data in its original order.
    :return: A triple containing the cross-validation predictions, the true values and a list of history-objects.
    """
    def identity(value):
        return value

    # The former defaults were the builtin `id`, which returns an object's memory address rather than the object.
    # An explicitly passed `id` is treated the same way for backward compatibility.
    if target_transform_fn is None or target_transform_fn is id:
        target_transform_fn = identity
    if target_stratify_fn is None or target_stratify_fn is id:
        target_stratify_fn = identity
    if fit_args is None:
        fit_args = {}

    # A random state only has an effect in combination with shuffling; recent scikit-learn versions reject
    # `random_state` when `shuffle` is False.
    kfold = StratifiedKFold(n_splits=n_splits,
                            shuffle=random_state is not None,
                            random_state=random_state)

    fold_predictions = []
    fold_y_true = []
    hists = []

    for fold, (train_index, val_index) in enumerate(kfold.split(X, target_stratify_fn(y)), start=1):
        print('\nFold {}/{}:'.format(fold, n_splits))
        print('==========')

        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        y_train_ = target_transform_fn(y_train)
        y_val_ = target_transform_fn(y_val)

        # Release the resources of the previous fold before the next model is built.
        keras.backend.clear_session()
        gc.collect()

        model, hist = fit_fn(X_train, y_train_, X_val, y_val_, fit_args, (fold == 1))

        if isinstance(hist, list):
            hists.extend(hist)
        else:
            hists.append(hist)

        fold_predictions.append(model.predict(X_val))
        # The untransformed labels are returned as ground truth.
        fold_y_true.append(y_val)

    cv_predictions = np.concatenate(fold_predictions, axis=0)
    cv_y_true = np.concatenate(fold_y_true, axis=0)
    return cv_predictions, cv_y_true, hists
================================================
FILE: ml_ids/keras/prediction.py
================================================
"""
Utility functions to create predictions using Keras models.
"""
# Batch size passed to `model.predict`.
PREDICT_BATCH_SIZE = 16384


def predict(model, X, decision_boundary=0.5):
    """
    Creates binary predictions from the output of a Keras model.

    Every sample whose model output is greater than or equal to the decision boundary is assigned the positive
    label `1`, all other samples are assigned the negative label `0`.

    :param model: Keras model.
    :param X: Dataset containing samples.
    :param decision_boundary: Decision boundary used to assign predictions to the positive class.
    :return: flattened numpy array containing the binary predictions as one of the values {0, 1}.
    """
    probabilities = model.predict(X, batch_size=PREDICT_BATCH_SIZE)
    is_positive = probabilities >= decision_boundary
    return is_positive.astype('int').reshape(-1)
def predict_proba(model, X):
    """
    Returns the raw output of a Keras model for a binary classification task as a flat array.
    The model output is interpreted as the class probability of the positive class.

    :param model: Keras model.
    :param X: Dataset containing samples.
    :return: flattened numpy array containing the class probabilities of the positive class.
    """
    scores = model.predict(X, batch_size=PREDICT_BATCH_SIZE)
    return scores.reshape(-1)
================================================
FILE: ml_ids/libs/dfencoder/dataframe.py
================================================
# Copyright (c) 2019, Michael Klear.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# * Neither the name of the dfencoder Developers nor the names of any
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pandas as pd
import numpy as np
class EncoderDataFrame(pd.DataFrame):
    """
    `pandas.DataFrame` subclass adding the `swap` method used to generate swap noise.
    """

    def __init__(self, *args, **kwargs):
        super(EncoderDataFrame, self).__init__(*args, **kwargs)

    def swap(self, likelihood=.15):
        """
        Performs random swapping of data.
        Each value has a likelihood of *argument likelihood*
        of being randomly replaced with a value from a different
        row.

        Values are only ever exchanged within the same column. The calling frame is left untouched.

        Returns a copy of the dataframe with equal size. The column names and dtypes are kept, the row index of
        the result is a fresh default index.
        """
        # select values to swap
        tot_rows = len(self)
        n_rows = int(round(tot_rows * likelihood))
        n_cols = len(self.columns)

        def gen_indices():
            # One randomly drawn row position per (swap, column) pair; the column index is fixed per column.
            column = np.repeat(np.arange(n_cols).reshape(1, -1), repeats=n_rows, axis=0)
            row = np.random.randint(0, tot_rows, size=(n_rows, n_cols))
            return row, column

        # Work on a private copy: `self.values` can be a view on the frame's own data (single-dtype frames),
        # in which case writing into it would silently corrupt the source frame.
        new_mat = self.to_numpy(copy=True)

        if n_rows > 0:
            row, column = gen_indices()
            to_place = new_mat[row, column]

            row, column = gen_indices()
            new_mat[row, column] = to_place

        # The matrix may have been up-cast to a common dtype, so restore the original dtype of every column.
        dtypes = dict(zip(self.columns, self.dtypes))
        result = EncoderDataFrame(columns=self.columns, data=new_mat)
        result = result.astype(dtypes)

        return result
================================================
FILE: ml_ids/model_selection.py
================================================
"""
Utilities for machine learning model selection.
"""
from typing import Tuple, List
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
def train_val_test_split(df: pd.DataFrame,
                         val_size: float = 0.1,
                         test_size: float = 0.1,
                         stratify_col: str = None,
                         random_state: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Partitions the given DataFrame into a training, a validation and a test set.

    The split is performed in two steps: first a hold-out set of size `val_size + test_size` is separated from the
    training data, then the hold-out set is divided into the validation and the test set.

    :param df: Input DataFrame.
    :param val_size: Fraction of the data used for the validation set.
    :param test_size: Fraction of the data used for the test set.
    :param stratify_col: Column to stratify by. No stratification is applied if omitted.
    :param random_state: Random state, used for both splitting steps.
    :return: A triple containing (`train`, `val`, `test`) sets.
    """
    holdout_size = val_size + test_size
    assert holdout_size < 1, 'Sum of validation and test size must not be > 1.'

    def stratify_labels(frame):
        return frame[stratify_col] if stratify_col else None

    df_train, df_hold = train_test_split(df,
                                         test_size=holdout_size,
                                         stratify=stratify_labels(df),
                                         random_state=random_state)

    # Express the test share relative to the hold-out set instead of the full dataset.
    df_val, df_test = train_test_split(df_hold,
                                       test_size=test_size / holdout_size,
                                       stratify=stratify_labels(df_hold),
                                       random_state=random_state)

    return df_train, df_val, df_test
def split_x_y(df: pd.DataFrame, y_cols: List[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Separates the predictor variables from the labels.

    :param df: Input DataFrame.
    :param y_cols: Columns forming the labels DataFrame `y`.
        Defaults to `['label', 'label_cat', 'label_is_attack']`.
    :return: A tuple (`X`, `y`): `X` is the input without the label columns, `y` contains only the label columns.
    """
    label_cols = ['label', 'label_cat', 'label_is_attack'] if y_cols is None else y_cols
    predictors = df.drop(columns=label_cols)
    labels = df[label_cols]
    return predictors, labels
def best_precision_for_target_recall(y_true, y_pred_score, target_recall):
    """
    Determines the decision boundary for the best precision given a specified target recall by using
    the precision-recall curve.

    :param y_true: True labels.
    :param y_pred_score: Predicted scores (e.g. probabilities of the positive class).
    :param target_recall: Target recall.
    :return: Decision boundary.
    """
    _, recalls, thresholds = precision_recall_curve(y_true, y_pred_score)

    # `np.argmin` on the boolean mask yields the first position at which the recall is below the target
    # (or 0 if there is none).
    boundary_idx = np.argmin(recalls >= target_recall)

    # `recalls` holds one entry more than `thresholds` (see the scikit-learn documentation), so the position
    # may point one past the last threshold. Clamp it instead of failing with an IndexError.
    boundary_idx = min(boundary_idx, len(thresholds) - 1)
    return thresholds[boundary_idx]
================================================
FILE: ml_ids/models/__init__.py
================================================
================================================
FILE: ml_ids/models/gradient_boost/__init__.py
================================================
================================================
FILE: ml_ids/models/gradient_boost/mlflow_wrapper.py
================================================
"""
Wrapper to enable usage of a CatBoost estimator with MLflow.
"""
import pickle
import mlflow.pyfunc
from catboost import CatBoostClassifier
from ml_ids.data.dataset import remove_negative_values, remove_inf_values
class CatBoostWrapper(mlflow.pyfunc.PythonModel):
    """
    MLflow python-function model wrapping a CatBoost classifier together with its pre-processing pipeline.
    """

    def load_context(self, context):
        """
        Restores the pre-processing pipeline, the column configuration and the CatBoost model from the artifacts
        of the given MLflow context.
        """
        # pylint: disable=attribute-defined-outside-init
        artifacts = context.artifacts

        # NOTE: unpickling executes arbitrary code - the artifacts must originate from a trusted training run.
        with open(artifacts['pipeline'], 'rb') as pipeline_file:
            self.pipeline = pickle.load(pipeline_file)

        with open(artifacts['col_config'], 'rb') as config_file:
            column_config = pickle.load(config_file)

        self.clf = CatBoostClassifier()
        self.clf.load_model(artifacts['cbm_model'])
        self.col_names = column_config['col_names']
        self.preserve_cols = column_config['preserve_neg_vals']

    def preprocess(self, data):
        """
        Selects the configured feature columns, removes infinite and negative values and applies the
        pre-processing pipeline.

        :param data: Input dataset.
        :return: Transformed dataset.
        """
        selected = data[self.col_names]
        without_inf = remove_inf_values(selected)
        cleaned = remove_negative_values(without_inf, ignore_cols=self.preserve_cols)
        return self.pipeline.transform(cleaned)

    def predict(self, context, model_input):
        """
        Pre-processes the model input and returns the predictions of the CatBoost classifier.
        """
        return self.clf.predict(self.preprocess(model_input))
================================================
FILE: ml_ids/models/gradient_boost/train.py
================================================
"""
Utilities to train a machine learning estimator based on the Gradient Boosting algorithm using the CatBoost library.
"""
import logging
from collections import namedtuple
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import FunctionTransformer
from ml_ids.transform.preprocessing import create_pipeline
from ml_ids.transform.sampling import upsample_minority_classes
from ml_ids.model_selection import split_x_y
LOGGER = logging.getLogger(__name__)

# Hyper-parameters forwarded to the CatBoost classifier during training.
GradientBoostHyperParams = namedtuple(
    'GradientBoostHyperParams',
    'nr_iterations tree_depth l2_reg border_count random_strength task_type'
)
def fit_pipeline(train_dataset):
    """
    Builds the scikit-learn pre-processing pipeline and fits it on the predictor variables of the training data.

    All columns of the training dataset containing at least one missing value are imputed with the median.
    A `FunctionTransformer` without a transformation function takes the place of the scaler.

    :param train_dataset: Training dataset.
    :return: Tuple of (fitted scikit-learn pipeline, column names).
    """
    has_missing_values = train_dataset.isna().any()
    cols_to_impute = train_dataset.columns[has_missing_values].tolist()

    predictors, _ = split_x_y(train_dataset)

    pipeline, resolve_col_names = create_pipeline(predictors,
                                                  imputer_strategy='median',
                                                  imputer_cols=cols_to_impute,
                                                  scaler=FunctionTransformer,
                                                  scaler_args={'validate': False})
    pipeline.fit(predictors)

    # The column names can only be resolved once the pipeline is fitted.
    col_names = resolve_col_names()
    return pipeline, col_names
def preprocess_val_dataset(pipeline, val_dataset):
    """
    Applies the fitted pipeline to the predictor variables of the validation dataset.

    :param pipeline: Scikit-learn pipeline.
    :param val_dataset: Validation dataset.
    :return: Tuple of (transformed features, `label_is_attack` column of the labels)
    """
    predictors, labels = split_x_y(val_dataset)
    transformed = pipeline.transform(predictors)
    return transformed, labels.label_is_attack
def preprocess_train_dataset(pipeline, train_dataset, nr_attack_samples, random_state):
    """
    Applies the fitted pipeline to the training dataset, up-samples the minority categories and converts the
    category labels to binary labels.

    :param pipeline: Scikit-learn pipeline.
    :param train_dataset: Training dataset.
    :param nr_attack_samples: Minimum number of attack samples per category. If the actual number of samples in the
        dataset is lower than this number the SMOTE algorithm will be used to upsample this category to have the
        requested number of samples.
    :param random_state: Random state forwarded to the up-sampling.
    :return: Tuple of (transformed features, labels). Category `0` is mapped to label `0`, every other category
        to label `1`.
    """
    predictors, labels = split_x_y(train_dataset)
    transformed = pipeline.transform(predictors)

    resampled_x, resampled_categories = upsample_minority_classes(transformed, labels,
                                                                  min_samples=nr_attack_samples,
                                                                  random_state=random_state)

    is_attack = (resampled_categories != 0).astype('int')
    return resampled_x, is_attack
def calculate_class_weights(y_train):
    """
    Derives the class weights for binary training labels.

    Class `0` is weighted with 1, class `1` with the ratio of the number of `0` samples to the number of `1` samples.

    :param y_train: Training labels.
    :return: List of class weights `[weight of class 0, weight of class 1]`.
    """
    negatives = y_train[y_train == 0]
    positives = y_train[y_train == 1]
    return [1, len(negatives) / len(positives)]
def train_gb_classifier(train_pool,
                        val_pool,
                        class_weights,
                        nr_iterations,
                        tree_depth,
                        l2_reg,
                        border_count,
                        random_strength,
                        task_type,
                        random_state=None):
    """
    Fits a CatBoost gradient boosting classifier with log-loss objective on the training pool.

    :param train_pool: Training dataset.
    :param val_pool: Validation dataset, passed to CatBoost as evaluation set.
    :param class_weights: Class weights of the target labels.
    :param nr_iterations: The maximum number of trees that can be built when solving machine learning problems.
    :param tree_depth: Depth of a single tree.
    :param l2_reg: Coefficient at the L2 regularization term of the cost function.
    :param border_count: The number of splits for numerical features.
    :param random_strength: The amount of randomness to use for scoring splits when the tree structure is selected.
    :param task_type: The processing unit type to use for training (CPU | GPU).
    :param random_state: State to initialize the random number generator.
    :return: Trained CatBoost classifier.
    """
    catboost_params = {
        'loss_function': 'Logloss',
        'iterations': nr_iterations,
        'depth': tree_depth,
        'l2_leaf_reg': l2_reg,
        'border_count': border_count,
        'random_strength': random_strength,
        'task_type': task_type,
        'class_weights': class_weights,
        'verbose': 1,
        'random_seed': random_state,
    }

    classifier = CatBoostClassifier(**catboost_params)
    classifier.fit(train_pool, eval_set=val_pool)
    return classifier
def train_model(train_dataset: pd.DataFrame,
                val_dataset: pd.DataFrame,
                hyper_params: GradientBoostHyperParams,
                nr_attack_samples: int,
                random_seed: int = None):
    """
    Runs the complete training procedure: fits the pre-processing pipeline, prepares the training (and optional
    validation) data and trains a CatBoost gradient boosting classifier.

    :param train_dataset: Training dataset.
    :param val_dataset: Validation dataset. May be `None`, in which case no evaluation set is used.
    :param hyper_params: Hyper-parameters applied to the Gradient Boosting algorithm.
    :param nr_attack_samples: Minimum number of attack samples per category. If the actual number of samples in the
        dataset is lower than this number the SMOTE algorithm will be used to upsample this category to have the
        requested number of samples.
    :param random_seed: Seed to initialize the random number generator.
    :return: Tuple of (CatBoost classifier, pre-processing pipeline, column names)
    """
    LOGGER.info('Training model with parameters [samples-per-attack-category=%s, hyperparams=%s]',
                nr_attack_samples,
                hyper_params)

    pipeline, col_names = fit_pipeline(train_dataset)

    X_train, y_train = preprocess_train_dataset(pipeline, train_dataset, nr_attack_samples, random_seed)
    train_pool = Pool(X_train, y_train)

    val_pool = None
    if val_dataset is not None:
        val_pool = Pool(*preprocess_val_dataset(pipeline, val_dataset))

    # The class weights are derived from the (up-sampled) training labels.
    class_weights = calculate_class_weights(y_train)

    classifier = train_gb_classifier(train_pool=train_pool,
                                     val_pool=val_pool,
                                     class_weights=class_weights,
                                     nr_iterations=hyper_params.nr_iterations,
                                     tree_depth=hyper_params.tree_depth,
                                     l2_reg=hyper_params.l2_reg,
                                     border_count=hyper_params.border_count,
                                     random_strength=hyper_params.random_strength,
                                     task_type=hyper_params.task_type,
                                     random_state=random_seed)

    return classifier, pipeline, col_names
================================================
FILE: ml_ids/prediction.py
================================================
"""
Utilities to create predictions given a Scikit-learn estimator and a dataset containing input features.
"""
def predict_proba_positive(clf, X):
    """
    Returns the probability of the positive class for every sample of a binary classification task.
    The positive class is expected in the second column of the output of `clf.predict_proba`.

    :param clf: Scikit-learn estimator.
    :param X: Dataset containing the samples.
    :return: flat numpy array containing the class probabilities of the positive class.
    """
    class_probabilities = clf.predict_proba(X)
    positive_class = class_probabilities[:, 1]
    return positive_class.reshape(-1)
def predict_decision_boundary(clf, X, decision_boundary=0.5):
    """
    Creates binary predictions from the positive-class probabilities of a scikit-learn model.

    Every sample whose probability of belonging to the positive class is greater than or equal to the decision
    boundary is assigned the positive label `1`, all other samples are assigned the negative label `0`.

    :param clf: Scikit-learn estimator.
    :param X: Dataset containing samples.
    :param decision_boundary: Decision boundary used to assign predictions to the positive class.
    :return: numpy array containing the binary predictions as one of the values {0, 1}.
    """
    positive_proba = predict_proba_positive(clf, X)
    is_positive = positive_proba >= decision_boundary
    return is_positive.astype('int')
================================================
FILE: ml_ids/tf_utils.py
================================================
"""
Utility functions for TensorFlow.
"""
import tensorflow as tf
def enable_gpu_memory_growth():
    """
    Enables the experimental memory growth setting for all available GPU devices.

    Does nothing if no GPU is available.

    :return: None
    """
    # TensorFlow requires the setting to be identical for all GPUs, so it is applied to every device
    # instead of the first one only. An empty device list no longer raises an IndexError.
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)
================================================
FILE: ml_ids/transform/__init__.py
================================================
================================================
FILE: ml_ids/transform/preprocessing.py
================================================
"""
Utilities for data pre-processing.
"""
from typing import List
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.exceptions import NotFittedError
from sklearn.base import BaseEstimator
def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame:
    """
    Removes all rows from the given DataFrame containing outliers in any of the columns.

    A value is an outlier if its absolute z-score (based on the population standard deviation of its column) is
    greater than or equal to `zscore`. Columns without any spread cannot contain outliers and are ignored.

    :param df: Input DataFrame.
    :param zscore: z-score to use when calculating outliers.
    :return: The DataFrame with all outliers removed.
    """
    std = df.std(ddof=0)
    scores = (df - df.mean()) / std.values

    # A standard deviation of zero turns every z-score of that column into NaN, which compares as False
    # and would therefore discard every single row. Only columns with a positive spread are checked.
    has_spread = (std > 0).values
    within_bounds = (np.abs(scores.loc[:, has_spread]) < zscore).all(axis=1)
    return df[within_bounds]
def create_pipeline(df: pd.DataFrame,
                    imputer_strategy: str = 'mean',
                    imputer_cols: List[str] = None,
                    scaler: BaseEstimator = StandardScaler,
                    scaler_args: dict = None,
                    cat_cols: List[str] = None,
                    copy: bool = True):
    """
    Creates a pipeline performing the following steps:
    - value imputation
    - value scaling
    - one-hot-encoding of categorical values.

    Numeric columns selected for imputation are imputed and scaled, all remaining numeric columns are only scaled.

    :param df: Input DataFrame.
    :param imputer_strategy: Imputer strategy applied to missing values.
        Allowed values are ['mean', 'median', 'most_frequent', 'constant'].
        If `None`, no column is imputed.
    :param imputer_cols: Columns to impute. If no columns are specified all numeric columns will be imputed.
    :param scaler: Scikit-learn scaler class to be applied to all values.
    :param scaler_args: Additional arguments forwarded to the specified scaler.
    :param cat_cols: Categorical columns to be one-hot-encoded.
    :param copy: If True, a copy of the input will be created.
    :return: A tuple containing the pipeline and a function returning the columns names after the pipeline has been
        fitted.
    """
    def create_get_feature_names(p, imp, scl, cat):
        def get_feature_names():
            if not hasattr(p, 'transformers_'):
                raise AssertionError('Pipeline is not yet fitted.')

            encoder = p.transformers_[2][1]
            # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
            # `get_feature_names_out`. Prefer the new method and fall back for older versions.
            resolve_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names

            try:
                cat_names = resolve_names(cat)
            except NotFittedError:
                # The encoder stays unfitted if there are no categorical columns.
                cat_names = []

            # The order matches the output of the ColumnTransformer: imputed, scaled, one-hot-encoded.
            return np.append(imp, np.append(scl, cat_names))

        return get_feature_names

    if scaler_args is None:
        scaler_args = {}

    cat_features = cat_cols if cat_cols else []
    num_features = [c for c in df.select_dtypes(include=[np.number]).columns.values if c not in cat_features]

    imp_features: List[str] = []
    if imputer_strategy is not None:
        imp_features = imputer_cols if imputer_cols else num_features

    scale_features = [f for f in num_features if f not in imp_features]

    imp_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy=imputer_strategy, copy=copy)),
        ('imp_scaler', scaler(**scaler_args))
    ])

    pipeline = ColumnTransformer([
        ('imp', imp_pipeline, imp_features),
        ('scl', scaler(**scaler_args), scale_features),
        ('one_hot', OneHotEncoder(categories='auto'), cat_features)
    ])

    return pipeline, create_get_feature_names(pipeline, imp_features, scale_features, cat_features)
================================================
FILE: ml_ids/transform/sampling.py
================================================
"""
Utilities to modify the number of samples of specific categories in a dataset.
"""
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, SMOTENC
from typing import Tuple, List
def upsample_minority_classes(X: np.ndarray,
                              y: pd.DataFrame,
                              min_samples: int,
                              random_state: int = None,
                              cat_cols: List[int] = None,
                              n_jobs: int = 24) -> Tuple[np.ndarray, np.ndarray]:
    """
    Synthetic up-sampling of minority classes using `imblearn.over_sampling.SMOTE`
    (or `SMOTENC` if categorical feature columns are specified).

    Every category of `y.label_cat` with fewer than `min_samples` samples is up-sampled to `min_samples`,
    categories with more samples keep their size.

    :param X: Predictor variables.
    :param y: Labels. Must contain the column `label_cat`.
    :param min_samples: Minimum samples of each class.
    :param random_state: Random state.
    :param cat_cols: Column indices of categorical features.
    :param n_jobs: Number of threads to use.
    :return: A tuple containing the up-sampled X and y values.
    """
    categories = y.label_cat
    counts = categories.value_counts()

    # Requested number of samples per category after resampling.
    sample_dict = {category: max(counts[category], min_samples) for category in np.unique(categories)}

    if cat_cols:
        sampler = SMOTENC(sampling_strategy=sample_dict,
                          categorical_features=cat_cols,
                          n_jobs=n_jobs,
                          random_state=random_state)
    else:
        sampler = SMOTE(sampling_strategy=sample_dict, n_jobs=n_jobs, random_state=random_state)

    x_s, y_s = sampler.fit_resample(X, categories)
    return x_s, y_s
def create_sample_dict(df: pd.DataFrame,
                       default_nr_samples: int,
                       samples_per_label: dict = None) -> dict:
    """
    Builds the mapping `label -> number of samples to draw` for all labels present in `df.label_cat`.

    The requested amount for a label is taken from `samples_per_label` if the label is listed there and
    falls back to `default_nr_samples` otherwise. It is capped by the number of rows available for the label.

    :param df: Input DataFrame.
    :param default_nr_samples: Default number of samples per label.
    :param samples_per_label: Number of samples for specific labels.
    :return: Dictionary containing the number of samples per label.
    """
    overrides = {} if samples_per_label is None else samples_per_label
    available = df.label_cat.value_counts().to_dict()
    return {label: min(overrides.get(label, default_nr_samples), count)
            for label, count in available.items()}
def downsample(df: pd.DataFrame,
               default_nr_samples: int,
               samples_per_label: dict = None,
               random_state: int = None) -> pd.DataFrame:
    """
    Downsamples the given DataFrame to contain at most `default_nr_samples` rows per label.

    The number of rows drawn for each label is determined by `create_sample_dict`; labels are read from the
    column `label_cat`. An empty DataFrame is returned as an (empty) copy instead of failing.

    :param df: Input DataFrame.
    :param default_nr_samples: Default number of samples per label.
    :param samples_per_label: Number of samples for specific labels.
    :param random_state: Random state.
    :return: The downsampled DataFrame.
    """
    # Without rows there are no labels and `pd.concat` would raise on the empty list of parts.
    if df.empty:
        return df.copy()

    sample_dict = create_sample_dict(df, default_nr_samples, samples_per_label)
    parts = [df[df.label_cat == label].sample(n=nr_samples, random_state=random_state)
             for label, nr_samples in sample_dict.items()]
    return pd.concat(parts)
================================================
FILE: ml_ids/visualization.py
================================================
"""
Visualization utilities for IPython Notebooks.
"""
# pylint: disable=import-error
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.metrics import confusion_matrix, classification_report, average_precision_score, precision_recall_curve
from IPython.display import display
def plot_hist(hist,
              metrics=None,
              y_lim=None,
              size=(8, 5),
              ax=None):
    """
    Draws the selected metrics of a Keras history object as line plots.

    If no axis is supplied a new figure of the given size is created and shown immediately.

    :param hist: The Keras history.
    :param metrics: A list of histories to plot, defaults to `['loss', 'val_loss']`.
    :param y_lim: Limits the y-axis.
    :param size: Size of the plot, only used if no axis is supplied.
    :param ax: Axis to apply the plot.
    """
    selected = ['loss', 'val_loss'] if metrics is None else metrics
    history_df = pd.DataFrame(hist.history)[selected]

    # A caller-supplied axis belongs to an existing figure, so no figure size is passed in that case.
    history_df.plot(figsize=None if ax else size, ax=ax)

    target_ax = ax or plt.gca()
    target_ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    if y_lim:
        target_ax.set_ylim(y_lim)

    if not ax:
        plt.grid(True)
        plt.show()
        return
    ax.grid(True)
def plot_confusion_matrix(y_true,
                          y_pred,
                          classes=None,
                          size=(10, 10),
                          normalize=False,
                          title=None,
                          print_raw=False,
                          cmap=plt.cm.Blues):
    """
    Computes the confusion matrix of the given labels and plots it as an annotated color matrix.
    Normalization can be applied by setting `normalize=True`.

    :param y_true: True labels.
    :param y_pred: Predicted labels.
    :param classes: List of class names used as tick labels. If None the ticks are left untouched.
    :param size: Size of the plot.
    :param normalize: If True each row of the confusion matrix is divided by its row sum.
    :param title: Title of the plot. If empty a default title depending on `normalize` is used.
    :param print_raw: If True the (possibly normalized) confusion matrix is printed.
    :param cmap: Color map
    :return: The axis containing the plot.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        # Divide every row by its sum; a row without any samples yields a division by zero.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    if print_raw:
        print(cm)

    fig, ax = plt.subplots(figsize=size)
    im = ax.matshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(title=title,
           ylabel='True label',
           xlabel='Predicted label')

    if classes is not None:
        x_labels = classes
        y_labels = classes

        # One tick per matrix column / row, labelled with the class names.
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               xticklabels=x_labels,
               yticklabels=y_labels)

    plt.margins(2)
    # Show the x tick labels below the matrix instead of above it.
    ax.tick_params(axis="x", bottom=True, labelbottom=True, top=False, labeltop=False, rotation=45)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum value get white text, all others black text.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax
def identity(x):
    """
    Returns the given value unchanged.

    :param x: Any value.
    :return: The very same object that was passed in.
    """
    return x
def plot_threshold(pred_train, pred_val, threshold, size=(15, 5), transform=identity):
    """
    Plots the distributions of the reconstruction errors of training and validation samples together with the
    classification threshold.

    Validation samples are split by their `y_true` value (0 = benign, 1 = attack). The reconstruction errors
    are read from the column `rec_error`.

    :param pred_train: Predictions of training samples.
    :param pred_val: Predictions of validation samples.
    :param threshold: Classification threshold, drawn as a dashed vertical line.
    :param size: Size of the plot.
    :param transform: Value transformation applied to the errors and the threshold before plotting.
    """
    _, ax = plt.subplots(figsize=size)

    def draw_density(samples, label):
        # One density curve (no histogram bars) of the transformed reconstruction errors.
        sns.distplot(transform(samples.rec_error.values), hist=False, ax=ax, label=label)

    draw_density(pred_train, 'Train Benign')
    draw_density(pred_val[pred_val.y_true == 0], 'Validation Benign')
    draw_density(pred_val[pred_val.y_true == 1], 'Validation Attack')

    ax.axvline(transform(threshold), color='red', linestyle='--')
    ax.legend()
def get_misclassifications(y, y_true, pred):
    """
    Calculates the misclassification rate for each label.

    Only labels with at least one misclassified sample are contained in the result. The column
    `percent_misclassified` holds the ratio `misclassified / total`, i.e. a fraction between 0 and 1.
    If no sample is misclassified an empty DataFrame with the same columns is returned.

    :param y: Pandas DataFrame containing the target labels in the column `label`.
    :param y_true: True labels.
    :param pred: Predicted labels.
    :return: Pandas DataFrame containing the misclassification per label, sorted by descending rate.
    """
    misclassifications = y[y_true != pred]

    # Left join on the label index: keeps exactly the labels that were misclassified at least once.
    mc_df = pd.merge(pd.DataFrame({'misclassified': misclassifications.label.value_counts()}),
                     pd.DataFrame({'total': y.label.value_counts()}),
                     how='left', left_index=True, right_index=True)

    # Column-wise division by name instead of a row-wise `apply` with positional access: it also works
    # for an empty frame and does not depend on positional indexing of a label-indexed Series.
    mc_df['percent_misclassified'] = mc_df['misclassified'] / mc_df['total']
    return mc_df.sort_values('percent_misclassified', ascending=False)
def print_binary_performance(y, y_true, pred, print_misclassifications=True, digits=3):
    """
    Reports the performance of a binary classifier by printing
    - the classification report,
    - the confusion matrix (plotted with the class names 'Benign' and 'Attack') and
    - optionally the misclassification report.

    :param y: Pandas DataFrame containing the target labels (binary, categories).
    :param y_true: True labels.
    :param pred: Predicted labels.
    :param print_misclassifications: If True the misclassification report is displayed as well.
    :param digits: Number of digits used to print the classification report.
    :return: None
    """
    print('Classification Report:', '======================', sep='\n')
    print(classification_report(y_true, pred, digits=digits))

    print('Confusion Matrix:', '=================', sep='\n')
    class_names = np.array(['Benign', 'Attack'])
    plot_confusion_matrix(y_true, pred, class_names, size=(5, 5))
    plt.show()

    if not print_misclassifications:
        return

    print('Misclassifications by attack category:', '======================================', sep='\n')
    display(get_misclassifications(y, y_true, pred))
def plot_pr_curve(y_true, y_score, size=(8, 5), average='weighted'):
    """
    Plots the precision-recall curve of a single estimator and prints its average precision score.

    :param y_true: True labels.
    :param y_score: Predicted probabilities.
    :param size: Size of the plot.
    :param average: Average parameter used for the calculation of the average precision score.
    :return: None
    """
    precision_values, recall_values, _ = precision_recall_curve(y_true, y_score)
    avg_precision = average_precision_score(y_true, y_score, average=average)

    # The average precision score is shown as the "auc" entry of the legend.
    curve_label = 'auc={}'.format(avg_precision)

    plt.figure(figsize=size)
    plt.plot(recall_values, precision_values, label=curve_label)
    plt.title('Precision / Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')
    plt.show()

    print('Average PR Score {}'.format(avg_precision))
def plot_pr_curves(y_true, y_score_dict, size=(8, 5), average='weighted'):
    """
    Plots the precision-recall curves of multiple estimators into a single figure.

    :param y_true: True labels.
    :param y_score_dict: Dictionary containing the estimator name as keys and the predicted label probabilities
                         as values.
    :param size: Size of the plot.
    :param average: Average parameter used for the calculation of the average precision score.
    :return: None
    """
    plt.figure(figsize=size)

    for estimator_name in y_score_dict:
        scores = y_score_dict[estimator_name]
        precision_values, recall_values, _ = precision_recall_curve(y_true, scores)
        avg_precision = average_precision_score(y_true, scores, average=average)
        curve_label = '{} (AUC={})'.format(estimator_name, avg_precision)
        plt.plot(recall_values, precision_values, label=curve_label)

    plt.title('Precision / Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')
    plt.show()
def plot_pr_threshold_curves(y_true, y_pred_score, size=(20, 8)):
    """
    Plots precision and recall as functions of the probability threshold.

    :param y_true: True labels.
    :param y_pred_score: Predicted probabilities.
    :param size: Size of the plot.
    :return: None
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_pred_score)

    plt.figure(figsize=size)
    # The last precision / recall value is dropped so that the curves line up with `thresholds`.
    for values, curve_label in ((precisions, "Precision"), (recalls, "Recall")):
        plt.plot(thresholds, values[:-1], label=curve_label)

    plt.title('Precision / Recall of different thresholds')
    plt.xlabel('Threshold')
    plt.ylabel('Precision / Recall')
    plt.legend(loc='lower right')
    plt.show()
================================================
FILE: models/gradient_boost/envs/local/train.py
================================================
import json
import click
import mlflow
import shutil
import os
def merge(dict1, dict2):
    """
    Combines two dictionaries into a new one without modifying the inputs.
    Entries of `dict2` take precedence over entries of `dict1` with the same key.

    :param dict1: First dictionary to merge
    :param dict2: Second dictionary to merge
    :return: Merged dictionary
    """
    merged = {}
    for source in (dict1, dict2):
        merged.update(source)
    return merged
@click.command()
@click.option('--train-path', type=click.Path(exists=True), required=True,
              help='Path to the train dataset in .h5 format.')
@click.option('--val-path', type=click.Path(exists=True), required=True,
              help='Path to the validation dataset in .h5 format.')
@click.option('--test-path', type=click.Path(exists=True), required=True,
              help='Path to the test dataset in .h5 format.')
@click.option('--output-path', type=click.Path(), required=True,
              help='Path to store the output.')
@click.option('--param-path', type=click.Path(exists=True), required=True,
              help='Path to the training parameters.')
def train(train_path, val_path, test_path, output_path, param_path):
    """Run the gradient boost MLflow project with the parameters stored in a JSON file."""
    with open(param_path, 'r') as f:
        params = json.load(f)

    # Start from a clean output directory: any previous content of `output_path` is deleted.
    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path, exist_ok=True)

    # The dataset and output locations given on the command line override same-named entries of the file.
    run_params = merge(params, {
        'train_path': train_path,
        'val_path': val_path,
        'test_path': test_path,
        'output_path': output_path,
        'artifact_path': output_path,
    })

    # The project URI is relative to the current working directory.
    mlflow.run('models/gradient_boost/project',
               parameters=run_params)


if __name__ == '__main__':
    train()
================================================
FILE: models/gradient_boost/envs/sagemaker/configs/deploy.json
================================================
{
"deploy": {
"app_name": "ml-ids-classifier",
"instance_type": "ml.t2.medium",
"instance_count": 1,
"region": "eu-west-1"
},
"role": "arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860",
"model_bucket": "s3://sagemaker-eu-west-1-763816190631",
"model_artifact": "model.tar.gz",
"model_name": "ml-ids-gb_mlflow_pyfunc"
}
================================================
FILE: models/gradient_boost/envs/sagemaker/configs/train-cpu.json
================================================
{
"train": {
"instance_type": "ml.m5.large",
"instance_count": 1,
"task_type": "CPU"
},
"role": "arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860",
"data": {
"train": "s3://ml-ids-2018-sm/training",
"val": "s3://ml-ids-2018-sm/validation",
"test": "s3://ml-ids-2018-sm/testing"
},
"model_bucket": "s3://sagemaker-eu-west-1-763816190631"
}
================================================
FILE: models/gradient_boost/envs/sagemaker/configs/train-gpu.json
================================================
{
"train": {
"instance_type": "ml.p2.xlarge",
"instance_count": 1,
"task_type": "GPU"
},
"role": "arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860",
"data": {
"train": "s3://ml-ids-2018-full/training",
"val": "s3://ml-ids-2018-full/validation",
"test": "s3://ml-ids-2018-full/testing"
},
"model_bucket": "s3://sagemaker-eu-west-1-763816190631"
}
================================================
FILE: models/gradient_boost/envs/sagemaker/container/Dockerfile
================================================
# Image that bundles the ml_ids package, the gradient boost MLflow project and an executable `train` script.
FROM nvidia/cuda:10.1-base
# Install Miniconda 3
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
ENV PATH /opt/conda/bin:$PATH
# System packages required to download and run the Miniconda installer plus common VCS clients.
RUN apt-get update --fix-missing && \
apt-get install -y wget bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 git mercurial subversion && \
apt-get clean
# Silent Miniconda installation into /opt/conda followed by a cleanup of caches and unneeded files.
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh -O ~/miniconda.sh && \
/bin/bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda clean -tipsy && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate base" >> ~/.bashrc && \
find /opt/conda/ -follow -type f -name '*.a' -delete && \
find /opt/conda/ -follow -type f -name '*.js.map' -delete && \
/opt/conda/bin/conda clean -afy
# Copy Conda environment file
# (copied on its own, before the project sources, so the environment layer below only depends on this file)
COPY models/gradient_boost/project/conda.yaml /opt/ml/code/conda.yaml
# Install Conda environment
RUN conda env create -f /opt/ml/code/conda.yaml
# Copy project files
ADD ml_ids /opt/ml/code/ml_ids
ADD models/gradient_boost/project /opt/ml/code/models/gradient_boost/project
COPY setup.cfg /opt/ml/code/setup.cfg
COPY setup.py /opt/ml/code/setup.py
# Activate conda env
# NOTE(review): `>` truncates ~/.bashrc, discarding the two lines appended during the Miniconda install above;
# confirm this is intended.
RUN echo "source activate ml-ids-gradient-boost-catboost" > ~/.bashrc
# Put the environment's binaries first on PATH so they are found without an interactive shell.
ENV PATH /opt/conda/envs/ml-ids-gradient-boost-catboost/bin:$PATH
# Copy train script and make it executable
# The script is stored without the `.py` extension; /opt/ml/code is added to PATH so `train` resolves as a command
# (presumably the command invoked by the SageMaker runtime - confirm).
COPY models/gradient_boost/envs/sagemaker/container/train.py /opt/ml/code/train
RUN chmod +x /opt/ml/code/train
ENV PATH="/opt/ml/code:${PATH}"
WORKDIR /opt/ml/code
================================================
FILE: models/gradient_boost/envs/sagemaker/container/train.py
================================================
#!/usr/bin/env python
import sys
import os
import json
import traceback
import uuid
import mlflow
# Root directory below which all inputs, outputs and the project code are located.
prefix = '/opt/ml/'

# Locations for failure reports, the trained model and the hyperparameter file.
output_path, model_path, param_path = (
    os.path.join(prefix, sub_path)
    for sub_path in ('output', 'model', 'input/config/hyperparameters.json')
)

# One input directory per data channel.
input_path = os.path.join(prefix, 'input/data')
training_path, validation_path, testing_path = (
    os.path.join(input_path, channel)
    for channel in ('training', 'validation', 'testing')
)

mlflow_project_uri = os.path.join(prefix, 'code/models/gradient_boost/project')

# Unique scratch directory for the intermediate outputs of a single run.
mlflow_out_path = os.path.join('/tmp', str(uuid.uuid4()))
def merge(dict1, dict2):
    """
    Combines two dictionaries into a new one without modifying the inputs.
    Entries of `dict2` take precedence over entries of `dict1` with the same key.

    :param dict1: First dictionary to merge
    :param dict2: Second dictionary to merge
    :return: Merged dictionary
    """
    merged = {}
    for source in (dict1, dict2):
        merged.update(source)
    return merged
if __name__ == '__main__':
    # Entry point of the training container: runs the MLflow project with the supplied hyperparameters
    # and reports success or failure through the exit code.
    print('Starting the training')

    try:
        with open(param_path, 'r') as param_file:
            training_params = json.load(param_file)

        # Fixed dataset and output locations; they override same-named hyperparameters.
        location_params = {
            'train_path': os.path.join(training_path, 'train.h5'),
            'val_path': os.path.join(validation_path, 'val.h5'),
            'test_path': os.path.join(testing_path, 'test.h5'),
            'output_path': mlflow_out_path,
            'artifact_path': model_path
        }
        mlflow_params = merge(training_params, location_params)

        os.makedirs(mlflow_out_path, exist_ok=True)

        mlflow.run(mlflow_project_uri, parameters=mlflow_params, use_conda=False)

        print('Training complete.')
        sys.exit(0)
    except Exception as e:
        failure_report = 'Exception during training: ' + str(e) + '\n' + traceback.format_exc()

        # Write out an error file. This will be returned as the failureReason in the
        # DescribeTrainingJob result.
        with open(os.path.join(output_path, 'failure'), 'w') as failure_file:
            failure_file.write(failure_report)

        # Printing this causes the exception to be in the training job logs, as well.
        print(failure_report, file=sys.stderr)

        # A non-zero exit code causes the training job to be marked as Failed.
        sys.exit(255)
================================================
FILE: models/gradient_boost/envs/sagemaker/scripts/build_image.sh
================================================
#!/usr/bin/env bash
#
# Builds the training image from the Dockerfile in
# models/gradient_boost/envs/sagemaker/container and tags it as <image-name>:<image-version>.
# The current directory is used as build context, so the script has to be started from the
# directory the Dockerfile's COPY/ADD paths are relative to.
#
# Usage: build_image.sh <image-name> <image-version>

image_name=$1
image_version=$2

# Both arguments are mandatory. Always report the script name ($0) together with the full usage.
if [ -z "$image_name" ] || [ -z "$image_version" ]
then
    echo "Usage: $0 <image-name> <image-version>" >&2
    exit 1
fi

fullname="${image_name}:${image_version}"

echo "Building image '${fullname}'"

docker build -f models/gradient_boost/envs/sagemaker/container/Dockerfile -t "${fullname}" .
================================================
FILE: models/gradient_boost/envs/sagemaker/scripts/deploy.py
================================================
import click
import json
import boto3
import tarfile
import re
import logging
from mlflow import sagemaker
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def unpack(file):
    """
    Unpacks compressed files of format `tar` and `tar.gz` into the current working directory.
    Files with any other extension are ignored.

    :param file: Filename.
    :return: None
    """
    if file.endswith("tar.gz"):
        mode = "r:gz"
    elif file.endswith("tar"):
        mode = "r:"
    else:
        return

    # The context manager closes the archive even if the extraction fails.
    # NOTE: `extractall` trusts the member paths of the archive, only use it for archives of a trusted origin.
    with tarfile.open(file, mode) as tar:
        tar.extractall()
@click.command()
@click.option('--config-path', type=click.Path(exists=True), required=True,
              help='Path to the config.')
@click.option('--job-id', type=str, required=True,
              help='Unique ID of the training job. Model is retrieved from a subdirectory with this name.')
def deploy(config_path, job_id):
    # Downloads the model artifact of the given training job from S3, unpacks it into the current
    # directory and deploys the contained MLflow model via `mlflow.sagemaker`.
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    deploy_config = config['deploy']
    app_name = deploy_config['app_name']
    instance_type = deploy_config['instance_type']
    instance_count = deploy_config['instance_count']
    region = deploy_config['region']

    role = config['role']
    model_name = config['model_name']
    # boto3 expects the plain bucket name without the URI scheme.
    model_bucket = config['model_bucket'].replace('s3://', '')
    model_artifact = config['model_artifact']

    # Object key of the artifact inside the bucket.
    model_path = '{}/output/{}'.format(job_id, model_artifact)

    logger.info('Deploying model with parameters '
                '[app-name="{}", instance-type="{}", instance-count={}, region="{}", model-path="{}"]'
                .format(app_name, instance_type, instance_count, region, model_path))

    boto3.client('s3').download_file(model_bucket, model_path, model_artifact)
    unpack(model_artifact)

    # `model_name` is passed as the model URI, a path relative to the current directory.
    sagemaker.deploy(app_name=app_name,
                     model_uri=model_name,
                     execution_role_arn=role,
                     region_name=region,
                     mode='replace',
                     instance_type=instance_type,
                     instance_count=instance_count)


if __name__ == '__main__':
    deploy()
================================================
FILE: models/gradient_boost/envs/sagemaker/scripts/push_image_to_ecr.sh
================================================
#!/usr/bin/env bash
#
# Tags an existing local image with its ECR name and pushes it to ECR.
# The ECR repository is created if it does not exist yet.
# On success the last line printed is "image-name=<full ECR image name>".
#
# Usage: push_image_to_ecr.sh <image-name> <image-version>

image_name=$1
image_version=$2

# Both arguments are mandatory. Always report the script name ($0) together with the full usage.
if [ -z "$image_name" ] || [ -z "$image_version" ]
then
    echo "Usage: $0 <image-name> <image-version>" >&2
    exit 1
fi

# Get the account number associated with the current IAM credentials
account=$(aws sts get-caller-identity --query Account --output text)

if [ $? -ne 0 ]
then
    exit 255
fi

# Get the region defined in the current configuration (default to eu-west-1 if none defined)
region=$(aws configure get region)
region=${region:-eu-west-1}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image_name}:${image_version}"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${image_name}" > /dev/null 2>&1

if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${image_name}" > /dev/null
fi

# Get the login command from ECR and execute it directly
# NOTE(review): `aws ecr get-login` is only available in AWS CLI v1; confirm the CLI version in use.
$(aws ecr get-login --region ${region} --no-include-email)

# Tag the locally available image with the full ECR name and push it to ECR.
# The image itself is not built here, it has to exist locally as <image-name>:<image-version>.
docker tag "${image_name}:${image_version}" "${fullname}"

docker push "${fullname}"

echo "image-name=${fullname}"
================================================
FILE: models/gradient_boost/envs/sagemaker/scripts/train.py
================================================
import json
import click
import logging
from sagemaker.estimator import Estimator
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def create_performance_metric_regex(id):
    """
    Builds the regular expression used to extract the value of a single performance metric.
    The expression matches `<id>:` followed by optional whitespace and captures the following
    run of digits and dots, e.g. `metric_name: 0.12345`.

    :param id: Metric identifier.
    :return: Regex
    """
    value_pattern = r':\s*([\d.]*)'
    return '{}'.format(id) + value_pattern
def create_metric_def(name, regex):
    """
    Wraps a metric name and its regex into a metric definition with the keys `Name` and `Regex`.

    :param name: Metric name.
    :param regex: Metric regex.
    :return: Metric definition as a `dict`.
    """
    return dict(Name=name, Regex=regex)
def get_metric_definitions():
    """
    Assembles the definitions of all metrics to monitor: train / validation loss, the best validation loss
    and the test metrics pr_auc, precision, recall and f1.

    :return: Metric definitions as a `list`.
    """
    # (metric name, regex) pairs in reporting order.
    metric_patterns = [
        ('train:loss', create_performance_metric_regex('learn')),
        ('val:loss', create_performance_metric_regex('test')),
        # The best validation loss uses `=` instead of `:` as separator.
        ('val:loss:best', r'bestTest\s=\s([\d.]*)'),
        ('test:pr_auc', create_performance_metric_regex('pr_auc')),
        ('test:precision', create_performance_metric_regex('precision')),
        ('test:recall', create_performance_metric_regex('recall')),
        ('test:f1', create_performance_metric_regex('f1')),
    ]
    return [create_metric_def(metric_name, pattern) for metric_name, pattern in metric_patterns]
@click.command()
@click.option('--config-path', type=click.Path(exists=True), required=True,
              help='Path to the config.')
@click.option('--param-path', type=click.Path(exists=True), required=True,
              help='Path to the training parameters.')
@click.option('--image-name', type=str, required=True,
              help='Name of the training image')
@click.option('--mode', type=click.Choice(['LOCAL', 'AWS'], case_sensitive=False), default='LOCAL',
              help='Training mode.')
@click.option('--job-id', type=str, required=True,
              help='Unique ID of the training job. Model outputs will be stored in a subdirectory with this name.')
def train(config_path, param_path, image_name, mode, job_id):
    """Start a SageMaker training job, either in local mode or on AWS."""
    with open(config_path, 'r') as f:
        config = json.load(f)

    with open(param_path, 'r') as f:
        params = json.load(f)

    # `--mode` is case-insensitive and, depending on the click version, the value is passed on in the
    # spelling given by the user. Normalize it so that e.g. `--mode local` does not fall through to AWS.
    mode = mode.upper()

    if mode == 'LOCAL':
        # Local mode always trains on the CPU.
        train_instance_type = 'local'
        params['task_type'] = 'CPU'
    else:
        train_instance_type = config['train']['instance_type']
        params['task_type'] = config['train']['task_type']

    train_instance_count = config['train']['instance_count']
    role = config['role']
    model_bucket = config['model_bucket']

    logger.info('Start training with parameters '
                '[job-id="{}", image="{}", mode="{}", instance_type="{}", instance_count={}, params={}]'
                .format(job_id, image_name, mode, train_instance_type, train_instance_count, params))

    estimator = Estimator(image_name=image_name,
                          role=role,
                          train_instance_count=train_instance_count,
                          train_instance_type=train_instance_type,
                          hyperparameters=params,
                          output_path=model_bucket,
                          metric_definitions=get_metric_definitions(),
                          # Maximum runtime of the training job: 2 hours, given in seconds.
                          train_max_run=(2 * 60 * 60))

    # One input channel per dataset.
    estimator.fit(job_name=job_id,
                  inputs={
                      'training': config['data']['train'],
                      'validation': config['data']['val'],
                      'testing': config['data']['test']
                  })


if __name__ == '__main__':
    train()
================================================
FILE: models/gradient_boost/envs/sagemaker/scripts/undeploy.py
================================================
import click
import json
from mlflow import sagemaker
@click.command()
@click.option('--config-path', type=click.Path(exists=True), required=True,
              help='Path to the config.')
def undeploy(config_path):
    # Deletes the application named in the `deploy` section of the config via `mlflow.sagemaker`.
    with open(config_path, 'r') as config_file:
        deploy_config = json.load(config_file)['deploy']

    target = {'app_name': deploy_config['app_name'], 'region_name': deploy_config['region']}
    sagemaker.delete(**target)


if __name__ == '__main__':
    undeploy()
================================================
FILE: models/gradient_boost/project/MLproject
================================================
name: gradient_boost_model
conda_env: conda.yaml
entry_points:
main:
parameters:
train_path: path
val_path: path
test_path: path
output_path: path
artifact_path: path
use_val_set: {type: bool, default: True}
nr_iterations: {type: int, default: 1000}
tree_depth: {type: int, default: 6}
l2_reg: {type: float, default: 3.0}
border_count: {type: int, default: 254}
random_strength: {type: int, default: 1}
task_type: {type: str, default: 'GPU'}
nr_samples_attack_category: {type: int, default: 1000}
random_seed: {type: int, default: -1}
command: "pip install -e ../../../. &&
python train.py --train-path {train_path}
--val-path {val_path}
--test-path {test_path}
--output-path {output_path}
--artifact-path {artifact_path}
--use-val-set {use_val_set}
--random-seed {random_seed}
--nr-iterations {nr_iterations}
--tree-depth {tree_depth}
--l2-reg {l2_reg}
--border-count {border_count}
--random-strength {random_strength}
--task-type {task_type}
--nr-samples-attack-category {nr_samples_attack_category}"
================================================
FILE: models/gradient_boost/project/conda.yaml
================================================
name: ml-ids-gradient-boost-catboost
channels:
- anaconda
- conda-forge
- defaults
dependencies:
- python=3.7
- pip=19.2.3=py37_0
- pandas=0.25.2=py37hb3f55d8_0
- catboost=0.18.1=py37_0
- imbalanced-learn=0.5.0=py_0
- scikit-learn=0.21.3=py37hcdab131_0
- scipy=1.3.1=py37h921218d_2
- click=7.0=py37_0
- cloudpickle=1.2.2=py_0
- pip:
- tables==3.6.1
- keras==2.2.4
- mlflow==1.4
================================================
FILE: models/gradient_boost/project/train.py
================================================
import click
import logging
import mlflow
import mlflow.pyfunc
import pickle
import os
import shutil
from catboost import Pool
from ml_ids.data.dataset import load_dataset_hdf
from ml_ids.data.metadata import FEATURES_NO_VARIANCE, FEATURES_TO_IGNORE, FEATURES_PRESERVE_NEG_COLUMNS
from ml_ids.prediction import predict_proba_positive
from ml_ids.model_selection import split_x_y
from ml_ids.models.gradient_boost.train import train_model, GradientBoostHyperParams
from ml_ids.models.gradient_boost.mlflow_wrapper import CatBoostWrapper
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def load_dataset(path):
    """
    Reads one dataset stored in `hdf` format.
    The columns listed in `FEATURES_NO_VARIANCE` and `FEATURES_TO_IGNORE` are passed as columns to omit,
    `FEATURES_PRESERVE_NEG_COLUMNS` as columns whose negative values are preserved.

    :param path: Dataset path.
    :return: Pandas DataFrame.
    """
    columns_to_omit = FEATURES_NO_VARIANCE + FEATURES_TO_IGNORE
    return load_dataset_hdf(dataset_path=path,
                            omit_cols=columns_to_omit,
                            preserve_neg_value_cols=FEATURES_PRESERVE_NEG_COLUMNS)
def load_train_val_test_dataset(train_path, val_path, test_path):
    """
    Reads the train, validation and test datasets, in this order, using `load_dataset`.

    :param train_path: Path to the train dataset.
    :param val_path: Path to the validation dataset.
    :param test_path: Path to the test dataset.
    :return: the `Tuple(train, val, test)` containing Pandas DataFrames.
    """
    train_df, val_df, test_df = (load_dataset(dataset_path)
                                 for dataset_path in (train_path, val_path, test_path))
    return train_df, val_df, test_df
def measure_performance(clf, pipeline, dataset):
    """
    Evaluates a classifier on the given dataset.

    The dataset is split into predictors and labels, the predictors are transformed by the preprocessing
    pipeline and the predictions are compared to the column `label_is_attack`.

    :param clf: Classifier to test.
    :param pipeline: Preprocessing pipeline.
    :param dataset: Dataset.
    :return: the `Tuple(pr_auc, precision, recall, f1)`.
    """
    features, labels = split_x_y(dataset)
    pool = Pool(pipeline.transform(features))
    y_true = labels.label_is_attack

    # The PR AUC is computed from the probabilities of the positive class, all other metrics from the
    # predicted labels.
    positive_proba = predict_proba_positive(clf, pool)
    predicted_labels = clf.predict(pool)

    return (average_precision_score(y_true, positive_proba),
            precision_score(y_true, predicted_labels),
            recall_score(y_true, predicted_labels),
            f1_score(y_true, predicted_labels))
def save_artifacts(cbm_model_path, classifier, pipeline_path, pipeline, col_config_path, column_config):
    """
    Persists all training artifacts.
    The classifier is stored via its own `save_model` method, pipeline and column config are pickled.

    :param cbm_model_path: Path on disk where the classifier should be stored.
    :param classifier: Classifier to store.
    :param pipeline_path: Path on disk where the pipeline should be stored.
    :param pipeline: Pipeline to store.
    :param col_config_path: Path on disk where the config should be stored.
    :param column_config: Column config to store.
    :return: None
    """
    classifier.save_model(cbm_model_path)

    pickled_artifacts = ((pipeline_path, pipeline), (col_config_path, column_config))
    for target_path, artifact in pickled_artifacts:
        with open(target_path, 'wb') as out_file:
            pickle.dump(artifact, out_file)
@click.command()
@click.option('--train-path', type=click.Path(exists=True), required=True,
              help='Path to the train dataset in .h5 format.')
@click.option('--val-path', type=click.Path(exists=True), required=True,
              help='Path to the validation dataset in .h5 format.')
@click.option('--test-path', type=click.Path(exists=True), required=True,
              help='Path to the test dataset in .h5 format.')
@click.option('--output-path', type=click.Path(exists=True), required=True,
              help='Path to store the output.')
@click.option('--artifact-path', type=click.Path(exists=True), required=True,
              help='Path to store the artifacts.')
@click.option('--use-val-set', type=bool, default=True,
              help='Determines if the evaluation dataset should be used for early stopping of the training process. '
                   'If set to False the evaluation dataset will be appended to the train dataset.')
@click.option('--random-seed', type=int, default=None,
              help='Random seed.')
@click.option('--nr-iterations', type=int, required=True)
@click.option('--tree-depth', type=int, required=True)
@click.option('--l2-reg', type=float, required=True)
@click.option('--border-count', type=int, required=True)
@click.option('--random-strength', type=int, required=True)
@click.option('--task-type', type=click.Choice(['CPU', 'GPU'], case_sensitive=False), required=True)
@click.option('--nr-samples-attack-category', type=int, required=True)
def train(train_path,
          val_path,
          test_path,
          output_path,
          artifact_path,
          use_val_set,
          random_seed,
          nr_iterations,
          tree_depth,
          l2_reg,
          border_count,
          random_strength,
          task_type,
          nr_samples_attack_category):
    """
    Train the gradient boost classifier, log its performance on the test dataset and store the
    resulting artifacts together with an MLflow pyfunc model.
    """
    # Start from an empty output directory so that no artifacts of a previous run remain.
    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path, exist_ok=True)

    cbm_model_path = os.path.join(output_path, 'gradient_boost_model.cbm')
    pipeline_path = os.path.join(output_path, 'preprocessing_pipeline.pkl')
    col_config_path = os.path.join(output_path, 'column_config.pkl')
    mlflow_model_path = os.path.join(artifact_path, 'ml-ids-gb_mlflow_pyfunc')

    # A seed of -1 is treated the same way as an omitted seed.
    random_seed = None if random_seed == -1 else random_seed

    logger.info('Loading datasets...')
    train_dataset, val_dataset, test_dataset = load_train_val_test_dataset(train_path, val_path, test_path)

    if not use_val_set:
        logger.info('Evaluation dataset will not be used for early stopping. Merging with training dataset.')
        # NOTE(review): assumes the datasets are pandas DataFrames; `DataFrame.append` was removed in
        # pandas 2.0 - confirm the pinned pandas version before upgrading.
        train_dataset = train_dataset.append(val_dataset)
        val_dataset = None
    else:
        logger.info('Evaluation dataset will be used for early stopping.')

    hyper_params = GradientBoostHyperParams(nr_iterations=nr_iterations,
                                            tree_depth=tree_depth,
                                            l2_reg=l2_reg,
                                            border_count=border_count,
                                            random_strength=random_strength,
                                            task_type=task_type)

    with mlflow.start_run():
        logger.info('Starting training...')
        clf, pipeline, column_names = train_model(train_dataset,
                                                  val_dataset,
                                                  hyper_params=hyper_params,
                                                  nr_attack_samples=nr_samples_attack_category,
                                                  random_seed=random_seed)

        pr_auc, precision, recall, f1 = measure_performance(clf, pipeline, test_dataset)

        logger.info('Estimator performance:')
        logger.info('pr_auc: %f', pr_auc)
        logger.info('precision: %f', precision)
        logger.info('recall: %f', recall)
        logger.info('f1: %f', f1)

        save_artifacts(cbm_model_path,
                       clf,
                       pipeline_path,
                       pipeline,
                       col_config_path,
                       {
                           'col_names': column_names,
                           'preserve_neg_vals': FEATURES_PRESERVE_NEG_COLUMNS
                       })

        # The files written by `save_artifacts` are referenced as artifacts of the pyfunc model.
        # NOTE(review): `conda_env` and `code_path` are relative paths - presumably the script has to
        # be started from its own directory; confirm against the run configuration.
        mlflow.pyfunc.save_model(
            path=mlflow_model_path,
            python_model=CatBoostWrapper(),
            artifacts={
                'cbm_model': cbm_model_path,
                'pipeline': pipeline_path,
                'col_config': col_config_path
            },
            conda_env='conda.yaml',
            code_path=['../../../ml_ids'])

        logger.info('Training completed.')
# Invoke the `train` click command when this file is executed as a script.
if __name__ == '__main__':
    train()
================================================
FILE: models/gradient_boost/training_params.json
================================================
{
"task_type": "GPU",
"use_val_set": true,
"nr_iterations": 2000,
"tree_depth": 10,
"l2_reg": 4.813919374945952,
"border_count": 254,
"random_strength": 5,
"nr_samples_attack_category": 100000
}
================================================
FILE: models/gradient_boost/training_params_quick_run.json
================================================
{
"task_type": "GPU",
"nr_iterations": 10,
"nr_samples_attack_category": 1000,
"random_seed": 42
}
================================================
FILE: notebooks/01_data-cleanup/data_cleanup.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Cleanup\n",
"\n",
"Before the CSE-CIC-IDS2018 dataset can be used for analysis and training, it has to be cleaned. In its raw format, the dataset consists of ten individual csv files, each containing the recorded network traffic of a single day of operation and named after the day the traffic was recorded on.\n",
"\n",
"To conduct an initial analysis of the dataset, a single file is loaded and dissected."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# set base path to the directory containing the csv files of the dataset\n",
"dataset_base_path = r'/path/to/dataset'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Removing invalid rows"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/envs/spark/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"\n",
"file_path = os.path.join(dataset_base_path, 'Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv')\n",
"\n",
"df = pd.read_csv(file_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Analyzing the output of `read_csv` shows that for most of the columns pandas could not infer a datatype."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 331125 entries, 0 to 331124\n",
"Data columns (total 80 columns):\n",
"Dst Port 331125 non-null object\n",
"Protocol 331125 non-null object\n",
"Timestamp 331125 non-null object\n",
"Flow Duration 331125 non-null object\n",
"Tot Fwd Pkts 331125 non-null object\n",
"Tot Bwd Pkts 331125 non-null object\n",
"TotLen Fwd Pkts 331125 non-null object\n",
"TotLen Bwd Pkts 331125 non-null object\n",
"Fwd Pkt Len Max 331125 non-null object\n",
"Fwd Pkt Len Min 331125 non-null object\n",
"Fwd Pkt Len Mean 331125 non-null object\n",
"Fwd Pkt Len Std 331125 non-null object\n",
"Bwd Pkt Len Max 331125 non-null object\n",
"Bwd Pkt Len Min 331125 non-null object\n",
"Bwd Pkt Len Mean 331125 non-null object\n",
"Bwd Pkt Len Std 331125 non-null object\n",
"Flow Byts/s 329291 non-null object\n",
"Flow Pkts/s 331125 non-null object\n",
"Flow IAT Mean 331125 non-null object\n",
"Flow IAT Std 331125 non-null object\n",
"Flow IAT Max 331125 non-null object\n",
"Flow IAT Min 331125 non-null object\n",
"Fwd IAT Tot 331125 non-null object\n",
"Fwd IAT Mean 331125 non-null object\n",
"Fwd IAT Std 331125 non-null object\n",
"Fwd IAT Max 331125 non-null object\n",
"Fwd IAT Min 331125 non-null object\n",
"Bwd IAT Tot 331125 non-null object\n",
"Bwd IAT Mean 331125 non-null object\n",
"Bwd IAT Std 331125 non-null object\n",
"Bwd IAT Max 331125 non-null object\n",
"Bwd IAT Min 331125 non-null object\n",
"Fwd PSH Flags 331125 non-null object\n",
"Bwd PSH Flags 331125 non-null object\n",
"Fwd URG Flags 331125 non-null object\n",
"Bwd URG Flags 331125 non-null object\n",
"Fwd Header Len 331125 non-null object\n",
"Bwd Header Len 331125 non-null object\n",
"Fwd Pkts/s 331125 non-null object\n",
"Bwd Pkts/s 331125 non-null object\n",
"Pkt Len Min 331125 non-null object\n",
"Pkt Len Max 331125 non-null object\n",
"Pkt Len Mean 331125 non-null object\n",
"Pkt Len Std 331125 non-null object\n",
"Pkt Len Var 331125 non-null object\n",
"FIN Flag Cnt 331125 non-null object\n",
"SYN Flag Cnt 331125 non-null object\n",
"RST Flag Cnt 331125 non-null object\n",
"PSH Flag Cnt 331125 non-null object\n",
"ACK Flag Cnt 331125 non-null object\n",
"URG Flag Cnt 331125 non-null object\n",
"CWE Flag Count 331125 non-null object\n",
"ECE Flag Cnt 331125 non-null object\n",
"Down/Up Ratio 331125 non-null object\n",
"Pkt Size Avg 331125 non-null object\n",
"Fwd Seg Size Avg 331125 non-null object\n",
"Bwd Seg Size Avg 331125 non-null object\n",
"Fwd Byts/b Avg 331125 non-null object\n",
"Fwd Pkts/b Avg 331125 non-null object\n",
"Fwd Blk Rate Avg 331125 non-null object\n",
"Bwd Byts/b Avg 331125 non-null object\n",
"Bwd Pkts/b Avg 331125 non-null object\n",
"Bwd Blk Rate Avg 331125 non-null object\n",
"Subflow Fwd Pkts 331125 non-null object\n",
"Subflow Fwd Byts 331125 non-null object\n",
"Subflow Bwd Pkts 331125 non-null object\n",
"Subflow Bwd Byts 331125 non-null object\n",
"Init Fwd Win Byts 331125 non-null object\n",
"Init Bwd Win Byts 331125 non-null object\n",
"Fwd Act Data Pkts 331125 non-null object\n",
"Fwd Seg Size Min 331125 non-null object\n",
"Active Mean 331125 non-null object\n",
"Active Std 331125 non-null object\n",
"Active Max 331125 non-null object\n",
"Active Min 331125 non-null object\n",
"Idle Mean 331125 non-null object\n",
"Idle Std 331125 non-null object\n",
"Idle Max 331125 non-null object\n",
"Idle Min 331125 non-null object\n",
"Label 331125 non-null object\n",
"dtypes: object(80)\n",
"memory usage: 202.1+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Querying the `info()` method of the dataframe shows that pandas inferred all columns as `object` columns, as opposed to the numerical columns which would be appropriate for most of them.\n",
"In order to understand why the columns are interpreted as `object` columns, low-cardinality columns are analyzed to show their individual values."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6 170066\n",
"17 95674\n",
"6 42833\n",
"17 15378\n",
"0 4596\n",
"0 2553\n",
"Protocol 25\n",
"Name: Protocol, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Protocol'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 268629\n",
"0 60520\n",
"1 1707\n",
"1 244\n",
"FIN Flag Cnt 25\n",
"Name: FIN Flag Cnt, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['FIN Flag Cnt'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The unique values indicate the existence of the column name as values in the dataset. \n",
"A visual examination of the input file confirms that the headers are present multiple times within the file, interweaved with the raw data rows. This suggests that a single file was created by concatenating mulitple csv files duplicating the headers in the process.\n",
"To fix this issue all columns containing the headers are removed from the dataframe."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"df = df[~df['Dst Port'].str.contains('Dst Port', na=False)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the next steps the dataframe is exported to a temporary csv file in order to read it again with the correct column datatypes. \n",
"Furthermore the column names are converted to lowercase with non-word characters removed for easier access of the columns."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"tmp_path = os.path.join(dataset_base_path, 'tmp')\n",
"\n",
"if not os.path.exists(tmp_path):\n",
" os.mkdir(tmp_path)\n",
"\n",
"column_name_regex = re.compile(r\"\\W\", re.IGNORECASE)\n",
"df.columns = [column_name_regex.sub('_', c.lower()) for c in df.columns]\n",
"\n",
"tmp_file_path = os.path.join(tmp_path, 'Thursday-01-03-2018_TrafficForML_CICFlowMeter_duplicate_headers_removed.csv')\n",
"\n",
"df.to_csv(tmp_file_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Removing invalid values\n",
"\n",
"Now the temporary file is loaded with the following datatype definitions. "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"types = {\n",
" 'dst_port': 'uint32',\n",
" 'protocol': 'uint8',\n",
" 'timestamp': 'object',\n",
" 'flow_duration': 'int64',\n",
" 'tot_fwd_pkts': 'uint32',\n",
" 'tot_bwd_pkts': 'uint32',\n",
" 'totlen_fwd_pkts': 'uint32',\n",
" 'totlen_bwd_pkts': 'uint32',\n",
" 'fwd_pkt_len_max': 'uint16',\n",
" 'fwd_pkt_len_min': 'uint16',\n",
" 'fwd_pkt_len_mean': 'float32',\n",
" 'fwd_pkt_len_std': 'float32',\n",
" 'bwd_pkt_len_max': 'uint16',\n",
" 'bwd_pkt_len_min': 'uint16',\n",
" 'bwd_pkt_len_mean': 'float32',\n",
" 'bwd_pkt_len_std': 'float32',\n",
" 'flow_byts_s': 'float64',\n",
" 'flow_pkts_s': 'float64',\n",
" 'flow_iat_mean': 'float32',\n",
" 'flow_iat_std': 'float32',\n",
" 'flow_iat_max': 'int64',\n",
" 'flow_iat_min': 'int64',\n",
" 'fwd_iat_tot': 'int64',\n",
" 'fwd_iat_mean': 'float32',\n",
" 'fwd_iat_std': 'float32',\n",
" 'fwd_iat_max': 'int64',\n",
" 'fwd_iat_min': 'int64',\n",
" 'bwd_iat_tot': 'uint32',\n",
" 'bwd_iat_mean': 'float32',\n",
" 'bwd_iat_std': 'float32',\n",
" 'bwd_iat_max': 'uint32',\n",
" 'bwd_iat_min': 'uint32',\n",
" 'fwd_psh_flags': 'uint8',\n",
" 'bwd_psh_flags': 'uint8',\n",
" 'fwd_urg_flags': 'uint8',\n",
" 'bwd_urg_flags': 'uint8',\n",
" 'fwd_header_len': 'uint32',\n",
" 'bwd_header_len': 'uint32',\n",
" 'flow_byts_s': 'float32',\n",
" 'bwd_pkts_s': 'float32',\n",
" 'pkt_len_min': 'uint16',\n",
" 'pkt_len_max': 'uint16',\n",
" 'pkt_len_mean': 'float32',\n",
" 'pkt_len_std': 'float32',\n",
" 'pkt_len_var': 'float32',\n",
" 'fin_flag_cnt': 'uint8',\n",
" 'syn_flag_cnt': 'uint8',\n",
" 'rst_flag_cnt': 'uint8',\n",
" 'psh_flag_cnt': 'uint8',\n",
" 'ack_flag_cnt': 'uint8',\n",
" 'urg_flag_cnt': 'uint8',\n",
" 'cwe_flag_count': 'uint8',\n",
" 'ece_flag_cnt': 'uint8',\n",
" 'down_up_ratio': 'uint16',\n",
" 'pkt_size_avg': 'float32',\n",
" 'fwd_seg_size_avg': 'float32',\n",
" 'bwd_seg_size_avg': 'float32',\n",
" 'fwd_byts_b_avg': 'uint8',\n",
" 'fwd_pkts_b_avg': 'uint8',\n",
" 'fwd_blk_rate_avg': 'uint8',\n",
" 'bwd_byts_b_avg': 'uint8',\n",
" 'bwd_pkts_b_avg': 'uint8',\n",
" 'bwd_blk_rate_avg': 'uint8',\n",
" 'subflow_fwd_pkts': 'uint32',\n",
" 'subflow_fwd_byts': 'uint32',\n",
" 'subflow_bwd_pkts': 'uint32',\n",
" 'subflow_bwd_byts': 'uint32',\n",
" 'init_fwd_win_byts': 'int32',\n",
" 'init_bwd_win_byts': 'int32',\n",
" 'fwd_act_data_pkts': 'uint32',\n",
" 'fwd_seg_size_min': 'uint8',\n",
" 'active_mean': 'float32',\n",
" 'active_std': 'float32',\n",
" 'active_max': 'uint32',\n",
" 'active_min': 'uint32',\n",
" 'idle_mean': 'float32',\n",
" 'idle_std': 'float32',\n",
" 'idle_max': 'uint64',\n",
" 'idle_min': 'uint64',\n",
" 'label': 'category'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "cannot safely convert passed user dtype of float32 for object dtyped data in column 17",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: Cannot cast array from dtype('O') to dtype('float32') according to the rule 'safe'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-10-80eb2d87d528>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtmp_file_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/anaconda3/envs/spark/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 700\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 702\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 703\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 704\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/anaconda3/envs/spark/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 434\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 435\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 436\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/anaconda3/envs/spark/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1137\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[0mnrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'nrows'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1139\u001b[0;31m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;31m# May alter columns / col_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/anaconda3/envs/spark/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1993\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1994\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1995\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1996\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1997\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_column_data\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._convert_tokens\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: cannot safely convert passed user dtype of float32 for object dtyped data in column 17"
]
}
],
"source": [
"df = pd.read_csv(tmp_file_path, dtype=types)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The error indicates that column 17 (Flow Byts/s) cannot be parsed. Another visual examination of the file reveals the existence of the string `Infinity` in multiple rows of this column.\n",
"The `read_csv()` method of pandas is not able to correctly parse this value as it only recognizes the strings `inf/-inf` as a valid representation of infinity.\n",
"\n",
"To fix this problem all occurrences of `Infinity` are replaced by the string `inf`."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/envs/spark/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (17,18) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
}
],
"source": [
"df = pd.read_csv(tmp_file_path)\n",
"\n",
"df_infinity_fixed = df.replace('Infinity', 'inf')\n",
"\n",
"tmp_file_path_inf = os.path.join(tmp_path, 'Thursday-01-03-2018_TrafficForML_CICFlowMeter_infinity_fixed.csv')\n",
"\n",
"df_infinity_fixed.to_csv(tmp_file_path_inf)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After fixing the infinity values the file can successfully be loaded with the given datatypes."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(tmp_file_path_inf, dtype=types)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 331100 entries, 0 to 331099\n",
"Data columns (total 82 columns):\n",
"Unnamed: 0 331100 non-null int64\n",
"Unnamed: 0.1 331100 non-null int64\n",
"dst_port 331100 non-null uint32\n",
"protocol 331100 non-null uint8\n",
"timestamp 331100 non-null object\n",
"flow_duration 331100 non-null int64\n",
"tot_fwd_pkts 331100 non-null uint32\n",
"tot_bwd_pkts 331100 non-null uint32\n",
"totlen_fwd_pkts 331100 non-null uint32\n",
"totlen_bwd_pkts 331100 non-null uint32\n",
"fwd_pkt_len_max 331100 non-null uint16\n",
"fwd_pkt_len_min 331100 non-null uint16\n",
"fwd_pkt_len_mean 331100 non-null float32\n",
"fwd_pkt_len_std 331100 non-null float32\n",
"bwd_pkt_len_max 331100 non-null uint16\n",
"bwd_pkt_len_min 331100 non-null uint16\n",
"bwd_pkt_len_mean 331100 non-null float32\n",
"bwd_pkt_len_std 331100 non-null float32\n",
"flow_byts_s 329266 non-null float32\n",
"flow_pkts_s 331100 non-null float64\n",
"flow_iat_mean 331100 non-null float32\n",
"flow_iat_std 331100 non-null float32\n",
"flow_iat_max 331100 non-null int64\n",
"flow_iat_min 331100 non-null int64\n",
"fwd_iat_tot 331100 non-null int64\n",
"fwd_iat_mean 331100 non-null float32\n",
"fwd_iat_std 331100 non-null float32\n",
"fwd_iat_max 331100 non-null int64\n",
"fwd_iat_min 331100 non-null int64\n",
"bwd_iat_tot 331100 non-null uint32\n",
"bwd_iat_mean 331100 non-null float32\n",
"bwd_iat_std 331100 non-null float32\n",
"bwd_iat_max 331100 non-null uint32\n",
"bwd_iat_min 331100 non-null uint32\n",
"fwd_psh_flags 331100 non-null uint8\n",
"bwd_psh_flags 331100 non-null uint8\n",
"fwd_urg_flags 331100 non-null uint8\n",
"bwd_urg_flags 331100 non-null uint8\n",
"fwd_header_len 331100 non-null uint32\n",
"bwd_header_len 331100 non-null uint32\n",
"fwd_pkts_s 331100 non-null float64\n",
"bwd_pkts_s 331100 non-null float32\n",
"pkt_len_min 331100 non-null uint16\n",
"pkt_len_max 331100 non-null uint16\n",
"pkt_len_mean 331100 non-null float32\n",
"pkt_len_std 331100 non-null float32\n",
"pkt_len_var 331100 non-null float32\n",
"fin_flag_cnt 331100 non-null uint8\n",
"syn_flag_cnt 331100 non-null uint8\n",
"rst_flag_cnt 331100 non-null uint8\n",
"psh_flag_cnt 331100 non-null uint8\n",
"ack_flag_cnt 331100 non-null uint8\n",
"urg_flag_cnt 331100 non-null uint8\n",
"cwe_flag_count 331100 non-null uint8\n",
"ece_flag_cnt 331100 non-null uint8\n",
"down_up_ratio 331100 non-null uint16\n",
"pkt_size_avg 331100 non-null float32\n",
"fwd_seg_size_avg 331100 non-null float32\n",
"bwd_seg_size_avg 331100 non-null float32\n",
"fwd_byts_b_avg 331100 non-null uint8\n",
"fwd_pkts_b_avg 331100 non-null uint8\n",
"fwd_blk_rate_avg 331100 non-null uint8\n",
"bwd_byts_b_avg 331100 non-null uint8\n",
"bwd_pkts_b_avg 331100 non-null uint8\n",
"bwd_blk_rate_avg 331100 non-null uint8\n",
"subflow_fwd_pkts 331100 non-null uint32\n",
"subflow_fwd_byts 331100 non-null uint32\n",
"subflow_bwd_pkts 331100 non-null uint32\n",
"subflow_bwd_byts 331100 non-null uint32\n",
"init_fwd_win_byts 331100 non-null int32\n",
"init_bwd_win_byts 331100 non-null int32\n",
"fwd_act_data_pkts 331100 non-null uint32\n",
"fwd_seg_size_min 331100 non-null uint8\n",
"active_mean 331100 non-null float32\n",
"active_std 331100 non-null float32\n",
"active_max 331100 non-null uint32\n",
"active_min 331100 non-null uint32\n",
"idle_mean 331100 non-null float32\n",
"idle_std 331100 non-null float32\n",
"idle_max 331100 non-null uint64\n",
"idle_min 331100 non-null uint64\n",
"label 331100 non-null category\n",
"dtypes: category(1), float32(22), float64(2), int32(2), int64(8), object(1), uint16(7), uint32(17), uint64(2), uint8(20)\n",
"memory usage: 95.7+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Infinity values of flow_byts_s: 1085\n",
"Null values of flow_byts_s: 1834\n"
]
}
],
"source": [
"print(f\"Infinity values of flow_byts_s: {df[df['flow_byts_s'] == np.inf]['dst_port'].count()}\")\n",
"print(f\"Null values of flow_byts_s: {df[df['flow_byts_s'].isnull()]['dst_port'].count()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. Cleanup Script\n",
"\n",
"In summary the following clean-up steps must be applied to all files of the dataset:\n",
"1. Removal of duplicate headers contained as rows of the dataset.\n",
"2. Substitution of occurrences of `Infinity` with `inf`\n",
"3. Renaming the column names to remove whitespaces and non-word characters\n",
"\n",
"The following script processes all files of the dataset and stores the output files using a name describing the attack types of the flows contained in the files rather than the date of the flows.\n",
"\n",
"Remark: The file `Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv` contains four columns (`Flow ID`, `Src IP`, `Dst IP`, `Src Port`) not present in any of the other files. As those columns are not required they are dropped upon loading the file."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import re\n",
"\n",
"csv_files = {\n",
" 'Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv': 'infiltration_28-02-2018.csv',\n",
" 'Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv': 'infiltration_01-03-2018.csv',\n",
" 'Friday-02-03-2018_TrafficForML_CICFlowMeter.csv': 'bot_02-03-2018.csv',\n",
" 'Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv': 'bruteforce-web-xss_sql-injection_22-02-2018.csv',\n",
" 'Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv': 'dos-goldeneye-slowloris_15-02-2018.csv',\n",
" 'Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv': 'ddos-loic-http-loic-udp_20-02-2018.csv',\n",
" 'Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv': 'ddos-loic-udp_hoic_21-02-2018.csv',\n",
" 'Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv': 'bruteforce-ftp-ssh_14-02-2018.csv',\n",
" 'Friday-16-02-2018_TrafficForML_CICFlowMeter.csv': 'dos-slowhttp-hulk_16-02-2018.csv',\n",
" 'Friday-23-02-2018_TrafficForML_CICFlowMeter.csv': 'bruteforce-web-xss_sql-injection_23-02-2018.csv'\n",
"}\n",
"\n",
"column_name_regex = re.compile(r\"\\W\", re.IGNORECASE)\n",
"processed_dir = 'processed'\n",
"processed_path = os.path.join(dataset_base_path, processed_dir)\n",
"\n",
"def remove_headers(f): \n",
" return f[~f['Dst Port'].str.contains('Dst Port', na=False)]\n",
"\n",
"def replace_infinity(f):\n",
" return f.replace('Infinity', 'inf', inplace=True)\n",
"\n",
"def remove_non_word_chars_from_column_names(f):\n",
" return [column_name_regex.sub('_', c.lower()) for c in df.columns]\n",
" \n",
"if not os.path.exists(processed_path):\n",
" os.mkdir(processed_path) \n",
" \n",
"for f, out in csv_files.items():\n",
" file_path = os.path.join(dataset_base_path, f)\n",
" output_path = os.path.join(dataset_base_path, processed_dir, out)\n",
" \n",
" df = pd.read_csv(file_path, dtype=str).drop(columns=['Flow ID', 'Src IP', 'Dst IP', 'Src Port'], errors='ignore')\n",
" df = remove_headers(df)\n",
" replace_infinity(df)\n",
" df.columns = remove_non_word_chars_from_column_names(df)\n",
" df.to_csv(output_path, index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: notebooks/02_exploratory-data-analysis/exploratory_data_analysis.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exploratory Data Analysis\n",
"\n",
"This notebook conducts an exploratory data analysis of the CSE-CIC-IDS2018 dataset. The notebook aims to answer the following questions:\n",
"1. How many benign and malicious network flows are contained in the dataset?\n",
"2. How many network flows are contained per attack type?\n",
"3. Is there a strong correlation between certain features?\n",
"4. Which features show a correlation with the binary class of a network flow?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dataset Preparation\n",
"\n",
"As a first step the dataset must be loaded and prepared for analysis. The following code imports all necessary libraries and loads the full dataset into a Pandas dataframe."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import glob\n",
"import os\n",
"from scipy import stats\n",
"from scipy.stats import ks_2samp\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# set base path to the directory containing the csv files of the dataset\n",
"dataset_base_path = r'/path/to/dataset'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"types = {\n",
" 'dst_port': 'uint32',\n",
" 'protocol': 'uint8',\n",
" 'timestamp': 'object',\n",
" 'flow_duration': 'int64',\n",
" 'tot_fwd_pkts': 'uint32',\n",
" 'tot_bwd_pkts': 'uint32',\n",
" 'totlen_fwd_pkts': 'uint32',\n",
" 'totlen_bwd_pkts': 'uint32',\n",
" 'fwd_pkt_len_max': 'uint16',\n",
" 'fwd_pkt_len_min': 'uint16',\n",
" 'fwd_pkt_len_mean': 'float32',\n",
" 'fwd_pkt_len_std': 'float32',\n",
" 'bwd_pkt_len_max': 'uint16',\n",
" 'bwd_pkt_len_min': 'uint16',\n",
" 'bwd_pkt_len_mean': 'float32',\n",
" 'bwd_pkt_len_std': 'float32',\n",
" 'flow_byts_s': 'float64',\n",
" 'flow_pkts_s': 'float64',\n",
" 'flow_iat_mean': 'float32',\n",
" 'flow_iat_std': 'float32',\n",
" 'flow_iat_max': 'int64',\n",
" 'flow_iat_min': 'int64',\n",
" 'fwd_iat_tot': 'int64',\n",
" 'fwd_iat_mean': 'float32',\n",
" 'fwd_iat_std': 'float32',\n",
" 'fwd_iat_max': 'int64',\n",
" 'fwd_iat_min': 'int64',\n",
" 'bwd_iat_tot': 'uint32',\n",
" 'bwd_iat_mean': 'float32',\n",
" 'bwd_iat_std': 'float32',\n",
" 'bwd_iat_max': 'uint32',\n",
" 'bwd_iat_min': 'uint32',\n",
" 'fwd_psh_flags': 'uint8',\n",
" 'bwd_psh_flags': 'uint8',\n",
" 'fwd_urg_flags': 'uint8',\n",
" 'bwd_urg_flags': 'uint8',\n",
" 'fwd_header_len': 'uint32',\n",
" 'bwd_header_len': 'uint32',\n",
" 'fwd_pkts_s': 'float32',\n",
" 'bwd_pkts_s': 'float32',\n",
" 'pkt_len_min': 'uint16',\n",
" 'pkt_len_max': 'uint16',\n",
" 'pkt_len_mean': 'float32',\n",
" 'pkt_len_std': 'float32',\n",
" 'pkt_len_var': 'float32',\n",
" 'fin_flag_cnt': 'uint8',\n",
" 'syn_flag_cnt': 'uint8',\n",
" 'rst_flag_cnt': 'uint8',\n",
" 'psh_flag_cnt': 'uint8',\n",
" 'ack_flag_cnt': 'uint8',\n",
" 'urg_flag_cnt': 'uint8',\n",
" 'cwe_flag_count': 'uint8',\n",
" 'ece_flag_cnt': 'uint8',\n",
" 'down_up_ratio': 'uint16',\n",
" 'pkt_size_avg': 'float32',\n",
" 'fwd_seg_size_avg': 'float32',\n",
" 'bwd_seg_size_avg': 'float32',\n",
" 'fwd_byts_b_avg': 'uint8',\n",
" 'fwd_pkts_b_avg': 'uint8',\n",
" 'fwd_blk_rate_avg': 'uint8',\n",
" 'bwd_byts_b_avg': 'uint8',\n",
" 'bwd_pkts_b_avg': 'uint8',\n",
" 'bwd_blk_rate_avg': 'uint8',\n",
" 'subflow_fwd_pkts': 'uint32',\n",
" 'subflow_fwd_byts': 'uint32',\n",
" 'subflow_bwd_pkts': 'uint32',\n",
" 'subflow_bwd_byts': 'uint32',\n",
" 'init_fwd_win_byts': 'int32',\n",
" 'init_bwd_win_byts': 'int32',\n",
" 'fwd_act_data_pkts': 'uint32',\n",
" 'fwd_seg_size_min': 'uint8',\n",
" 'active_mean': 'float32',\n",
" 'active_std': 'float32',\n",
" 'active_max': 'uint32',\n",
" 'active_min': 'uint32',\n",
" 'idle_mean': 'float32',\n",
" 'idle_std': 'float32',\n",
" 'idle_max': 'uint64',\n",
" 'idle_min': 'uint64',\n",
" 'label': 'category'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"csv_files = glob.glob(os.path.join(dataset_base_path, '*.csv'))\n",
"\n",
"df = pd.concat((pd.read_csv(f, dtype=types) for f in csv_files))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Filling missing values\n",
"\n",
"The dataset contains `inf` values which should be substituted as they cannot be used effectively in calculations. For this purpose all columns containing `inf` values are identified."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Columns containing infinity values ['flow_byts_s', 'flow_pkts_s']\n"
]
}
],
"source": [
"inf_columns = [c for c in df.columns if df[df[c] == np.inf][c].count() > 0]\n",
"\n",
"print(f'Columns containing infinity values {inf_columns}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The two columns containing `inf` values are `flow_byts_s` and `flow_pkts_s`. \n",
"The `inf` values are replaced with `nan` values, subsequently imputing all `nan` values of those columns with the mean of the columns."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"for col in inf_columns:\n",
" df[col].replace([np.inf, -np.inf], np.nan, inplace=True)\n",
" mean = df[col].mean()\n",
" df[col].fillna(mean, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Label creation\n",
"\n",
"To analyze the dataset in terms of binary classification (benign/attack) and multi-classification (benign/attack-type) addtional label columns are created:\n",
"1. `label_is_attack` specifies if a network flow represents a benign or an malicious flow.\n",
"2. `label_is_attack_[attack_type]` specifies if a network flow represents a certain type of attack."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df['label'] = df.label.astype('category')\n",
"df['label_code'] = df['label'].cat.codes\n",
"df['label_is_attack'] = df.label.apply(lambda x: 0 if x == 'Benign' else 1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"attack_types = [a for a in df.label.value_counts().index.tolist() if a != 'Benign']\n",
"\n",
"for a in attack_types:\n",
" l = 'label_is_attack_' + a.replace('-', ' ').replace(' ', '_').lower()\n",
" df[l] = df.label.apply(lambda x: 1 if x == a else 0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Dataset Overview\n",
"\n",
"The basic information of the dataset reveals that the whole dataset consists of *16232943* network flows and that all `null` values were eliminated."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 16232943 entries, 0 to 1048574\n",
"Data columns (total 96 columns):\n",
"dst_port 16232943 non-null uint32\n",
"protocol 16232943 non-null uint8\n",
"timestamp 16232943 non-null object\n",
"flow_duration 16232943 non-null int64\n",
"tot_fwd_pkts 16232943 non-null uint32\n",
"tot_bwd_pkts 16232943 non-null uint32\n",
"totlen_fwd_pkts 16232943 non-null uint32\n",
"totlen_bwd_pkts 16232943 non-null uint32\n",
"fwd_pkt_len_max 16232943 non-null uint16\n",
"fwd_pkt_len_min 16232943 non-null uint16\n",
"fwd_pkt_len_mean 16232943 non-null float32\n",
"fwd_pkt_len_std 16232943 non-null float32\n",
"bwd_pkt_len_max 16232943 non-null uint16\n",
"bwd_pkt_len_min 16232943 non-null uint16\n",
"bwd_pkt_len_mean 16232943 non-null float32\n",
"bwd_pkt_len_std 16232943 non-null float32\n",
"flow_byts_s 16232943 non-null float64\n",
"flow_pkts_s 16232943 non-null float64\n",
"flow_iat_mean 16232943 non-null float32\n",
"flow_iat_std 16232943 non-null float32\n",
"flow_iat_max 16232943 non-null int64\n",
"flow_iat_min 16232943 non-null int64\n",
"fwd_iat_tot 16232943 non-null int64\n",
"fwd_iat_mean 16232943 non-null float32\n",
"fwd_iat_std 16232943 non-null float32\n",
"fwd_iat_max 16232943 non-null int64\n",
"fwd_iat_min 16232943 non-null int64\n",
"bwd_iat_tot 16232943 non-null uint32\n",
"bwd_iat_mean 16232943 non-null float32\n",
"bwd_iat_std 16232943 non-null float32\n",
"bwd_iat_max 16232943 non-null uint32\n",
"bwd_iat_min 16232943 non-null uint32\n",
"fwd_psh_flags 16232943 non-null uint8\n",
"bwd_psh_flags 16232943 non-null uint8\n",
"fwd_urg_flags 16232943 non-null uint8\n",
"bwd_urg_flags 16232943 non-null uint8\n",
"fwd_header_len 16232943 non-null uint32\n",
"bwd_header_len 16232943 non-null uint32\n",
"fwd_pkts_s 16232943 non-null float32\n",
"bwd_pkts_s 16232943 non-null float32\n",
"pkt_len_min 16232943 non-null uint16\n",
"pkt_len_max 16232943 non-null uint16\n",
"pkt_len_mean 16232943 non-null float32\n",
"pkt_len_std 16232943 non-null float32\n",
"pkt_len_var 16232943 non-null float32\n",
"fin_flag_cnt 16232943 non-null uint8\n",
"syn_flag_cnt 16232943 non-null uint8\n",
"rst_flag_cnt 16232943 non-null uint8\n",
"psh_flag_cnt 16232943 non-null uint8\n",
"ack_flag_cnt 16232943 non-null uint8\n",
"urg_flag_cnt 16232943 non-null uint8\n",
"cwe_flag_count 16232943 non-null uint8\n",
"ece_flag_cnt 16232943 non-null uint8\n",
"down_up_ratio 16232943 non-null uint16\n",
"pkt_size_avg 16232943 non-null float32\n",
"fwd_seg_size_avg 16232943 non-null float32\n",
"bwd_seg_size_avg 16232943 non-null float32\n",
"fwd_byts_b_avg 16232943 non-null uint8\n",
"fwd_pkts_b_avg 16232943 non-null uint8\n",
"fwd_blk_rate_avg 16232943 non-null uint8\n",
"bwd_byts_b_avg 16232943 non-null uint8\n",
"bwd_pkts_b_avg 16232943 non-null uint8\n",
"bwd_blk_rate_avg 16232943 non-null uint8\n",
"subflow_fwd_pkts 16232943 non-null uint32\n",
"subflow_fwd_byts 16232943 non-null uint32\n",
"subflow_bwd_pkts 16232943 non-null uint32\n",
"subflow_bwd_byts 16232943 non-null uint32\n",
"init_fwd_win_byts 16232943 non-null int32\n",
"init_bwd_win_byts 16232943 non-null int32\n",
"fwd_act_data_pkts 16232943 non-null uint32\n",
"fwd_seg_size_min 16232943 non-null uint8\n",
"active_mean 16232943 non-null float32\n",
"active_std 16232943 non-null float32\n",
"active_max 16232943 non-null uint32\n",
"active_min 16232943 non-null uint32\n",
"idle_mean 16232943 non-null float32\n",
"idle_std 16232943 non-null float32\n",
"idle_max 16232943 non-null uint64\n",
"idle_min 16232943 non-null uint64\n",
"label 16232943 non-null category\n",
"label_code 16232943 non-null int8\n",
"label_is_attack 16232943 non-null int64\n",
"label_is_attack_ddos_attack_hoic 16232943 non-null int64\n",
"label_is_attack_ddos_attacks_loic_http 16232943 non-null int64\n",
"label_is_attack_dos_attacks_hulk 16232943 non-null int64\n",
"label_is_attack_bot 16232943 non-null int64\n",
"label_is_attack_ftp_bruteforce 16232943 non-null int64\n",
"label_is_attack_ssh_bruteforce 16232943 non-null int64\n",
"label_is_attack_infilteration 16232943 non-null int64\n",
"label_is_attack_dos_attacks_slowhttptest 16232943 non-null int64\n",
"label_is_attack_dos_attacks_goldeneye 16232943 non-null int64\n",
"label_is_attack_dos_attacks_slowloris 16232943 non-null int64\n",
"label_is_attack_ddos_attack_loic_udp 16232943 non-null int64\n",
"label_is_attack_brute_force__web 16232943 non-null int64\n",
"label_is_attack_brute_force__xss 16232943 non-null int64\n",
"label_is_attack_sql_injection 16232943 non-null int64\n",
"dtypes: category(1), float32(22), float64(2), int32(2), int64(21), int8(1), object(1), uint16(7), uint32(17), uint64(2), uint8(20)\n",
"memory usage: 6.3+ GB\n"
]
}
],
"source": [
"df.info(verbose=True, null_counts=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Number of benign network flows in relation to malicious flows\n",
"\n",
"Plotting the numbers of benign network flows versus malicious network flows shows that the dataset is heavily skewed in favor of benign network flows which account for ~83% of all data."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x1aa6d26160>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAbkAAAFJCAYAAAAGxlMxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAToklEQVR4nO3de5CddX3H8feHBLwUbzXbqgkxSKk1RRRd0WpHwdo20JagQx0yeEHRtFPRaXWsOK3oYO2Mt7HWxtLUImItlGK1qY2i9YYVsSwKSKDYDFrZgUq8a61i7Ld/nLN6XDabs5t9cnZ/5/2a2Zl9Luc532Um8+Y5l+dJVSFJUosOGfUAkiR1xchJkppl5CRJzTJykqRmGTlJUrOMnCSpWSsyckkuSHJHkhuG2PdNSa7t/3w+yTcOxoySpNHLSvyeXJInAt8BLqqqYxbwuBcCx1XVczsbTpK0bKzIM7mqugL42uC6JEcl+UCSa5J8IskvzPHQLcDFB2VISdLIrR71AEtoO/C7VfWfSR4LvBV48szGJA8GjgQ+MqL5JEkHWRORS3I48HjgH5LMrL7brN1OBy6rqh8ezNkkSaPTROTovez6jap65Dz7nA684CDNI0laBlbke3KzVdW3gC8k+W2A9DxiZnuShwL3Az41ohElSSOwIiOX5GJ6wXpokukkZwFnAGcluQ7YBWweeMgW4JJaiR8llSQt2or8CoEkScNYkWdykiQNo7PIDXtVkiSPSfLDJKd1NYskaTx19nLlMFclSbIK+BDwPeCCqrpsf8dds2ZNbdiwYSlHlSStcNdcc81Xqmpi9vrOvkJQVVck2bCf3V4IvBt4zLDH3bBhA1NTUwcwmSSpNUn+a671I3tPLsla4KnA+aOaQZLUtlF+8OTPgJcNcwWSJFuTTCWZ2rNnz0EYTZLUglFe8WQSuKR/Ga41wMlJ9lbVe2fvWFXb6V2bksnJSb/zIEkaysgiV1VHzvye5ELgfXMFTpKkxeoscv2rkpwArEkyDbwSOBSgqnwfTpLUuS4/XbllAfue2dUckqTx5RVPJEnNMnKSpGYZOUlSs4ycJKlZRk6S1KxRfhl8WXj0Sy8a9QgaI9e8/lmjHkEaK57JSZKaZeQkSc0ycpKkZhk5SVKzjJwkqVlGTpLULCMnSWqWkZMkNcvISZKaZeQkSc0ycpKkZhk5SVKzjJwkqVlGTpLULCMnSWqWkZMkNcvISZKaZeQkSc0ycpKkZhk5SVKzjJwkqVlGTpLULCMnSWqWkZMkNauzyCW5IMkdSW7Yx/Yzklzf/7kyySO6mkWSNJ66PJO7ENg0z/YvAE+qqmOBVwPbO5xFkjSGVnd14Kq6IsmGebZfObB4FbCuq1kkSeNpubwndxbw/lEPIUlqS2dncsNKciK9yP3yPPtsBbYCrF+//iBNJkla6UZ6JpfkWOBtwOaq+uq+9quq7VU1WVWTExMTB29ASdKKNrLIJVkP/CPwzKr6/KjmkCS1q7OXK5NcDJwArEkyDbwSOBSgqs4HzgXuD7w1CcDeqprsah5J0vjp8tOVW/az/XnA87p6fkmSlsunKyVJWnJGTpLULCMnSWqWkZMkNcvISZKaZeQkSc0ycpKkZhk5SVKzjJwkqVlGTpLULCMnSWqWkZMkNcvISZKaZeQkSc0ycpKkZhk5SVKzjJwkqVlGTpLULCMnSWqWkZMkNcvISZKaZeQkSc0ycpKkZhk5SVKzjJwkqVlGTpLULCMnSWqWkZMkNcvISZKa1VnkklyQ5I4kN+xje5L8eZLdSa5P8qiuZpEkjacuz+QuBDbNs/0k4Oj+z1bgLzucRZI0hjqLXFVdAXxtnl02AxdVz1XAfZM8sKt5JEnjZ5Tvya0Fbh1Ynu6vkyRpSYwycpljXc25Y7I1yVSSqT179nQ8liSpFaOM3DRwxMDyOuC2uXasqu1VNVlVkxMTEwdlOEnSyjfKyO0AntX/lO
XjgG9W1e0jnEeS1JjVXR04ycXACcCaJNPAK4FDAarqfGAncDKwG/gu8JyuZpEkjafOIldVW/azvYAXdPX8kiR5xRNJUrOMnCSpWUZOktQsIydJapaRkyQ1y8hJkppl5CRJzTJykqRmGTlJUrOMnCSpWUZOktQsIydJapaRkyQ1y8hJkppl5CRJzTJykqRmGTlJUrOMnCSpWUZOktQsIydJapaRkyQ1y8hJkppl5CRJzTJykqRmGTlJUrOMnCSpWUZOktQsIydJapaRkyQ1q9PIJdmU5OYku5OcM8f29Uk+muSzSa5PcnKX80iSxktnkUuyCtgGnARsBLYk2Thrtz8GLq2q44DTgbd2NY8kafx0eSZ3PLC7qm6pqjuBS4DNs/Yp4N793+8D3NbhPJKkMbO6w2OvBW4dWJ4GHjtrn1cBH0zyQuCngKd0OI8kacx0eSaXOdbVrOUtwIVVtQ44GXhnkrvMlGRrkqkkU3v27OlgVElSi7qM3DRwxMDyOu76cuRZwKUAVfUp4O7AmtkHqqrtVTVZVZMTExMdjStJak2XkbsaODrJkUkOo/fBkh2z9vkS8CsASR5GL3KeqkmSlkRnkauqvcDZwOXATfQ+RbkryXlJTunv9hLg+UmuAy4Gzqyq2S9pSpK0KF1+8ISq2gnsnLXu3IHfbwSe0OUMkqTx5RVPJEnNGipyST48zDpJkpaTeV+uTHJ34J7AmiT348dfC7g38KCOZ5Mk6YDs7z253wF+n17QruHHkfsWvUt2SZK0bM0buap6M/DmJC+sqrccpJkkSVoSQ326sqrekuTxwIbBx1TVRR3NJUnSARsqckneCRwFXAv8sL+6ACMnSVq2hv2e3CSw0S9qS5JWkmG/J3cD8IAuB5EkaakNeya3Brgxyb8D359ZWVWn7PshkiSN1rCRe1WXQ0iS1IVhP1358a4HkSRpqQ376cpv8+Mbnh4GHAr8T1Xdu6vBJEk6UMOeyd1rcDnJqcDxnUwkSdISWdRdCKrqvcCTl3gWSZKW1LAvVz5tYPEQet+b8ztzkqRlbdhPV/7WwO97gS8Cm5d8GkmSltCw78k9p+tBJElaasPeNHVdkvckuSPJl5O8O8m6roeTJOlADPvBk7cDO+jdV24t8M/9dZIkLVvDRm6iqt5eVXv7PxcCEx3OJUnSARs2cl9J8owkq/o/zwC+2uVgkiQdqGEj91zg6cB/A7cDpwF+GEWStKwN+xWCVwPPrqqvAyT5aeAN9OInSdKyNOyZ3LEzgQOoqq8Bx3UzkiRJS2PYyB2S5H4zC/0zuWHPAiVJGolhQ/VG4Mokl9G7nNfTgdd0NpUkSUtg2CueXJRkit5FmQM8rapu7HQySZIO0NAvOfajZtgkSSvGom61I0nSStBp5JJsSnJzkt1JztnHPk9PcmOSXUn+rst5JEnjpbNPSCZZBWwDfhWYBq5OsmPwvbwkRwMvB55QVV9P8jNdzSNJGj9dnskdD+yuqluq6k7gEu56D7rnA9tmvoNXVXd0OI8kacx0Gbm1wK0Dy9P9dYN+Hvj5JJ9MclWSTR3OI0kaM11+oTtzrKs5nv9o4ARgHfCJJMdU1Td+4kDJVmArwPr165d+UklSk7o8k5sGjhhYXgfcNsc+/1RVP6iqLwA304veT6iq7VU1WVWTExPe4UeSNJwuI3c1cHSSI5McBpxO78arg94LnAiQZA29ly9v6XAmSdIY6SxyVbUXOBu4HLgJuLSqdiU5L8kp/d0uB76a5Ebgo8BLq8r71EmSlkSnF1muqp3Azlnrzh34vYAX938kSVpSXvFEktQsIydJapaRkyQ1y8hJkppl5CRJzTJykqRmGTlJUrOMnCSpWUZOktQsIydJapaRkyQ1y8hJkppl5CRJzTJykqRmGTlJUrOMnCSpWUZOktQsIydJapaRkyQ1y8hJkppl5CRJzTJykqRmGTlJUrOMnCSpWUZOktQsIydJapaRkyQ1y8hJkppl5CRJzeo0ckk2Jbk5ye
4k58yz32lJKslkl/NIksZLZ5FLsgrYBpwEbAS2JNk4x373Al4EfLqrWSRJ46nLM7njgd1VdUtV3QlcAmyeY79XA68DvtfhLJKkMdRl5NYCtw4sT/fX/UiS44Ajqup9Hc4hSRpTXUYuc6yrH21MDgHeBLxkvwdKtiaZSjK1Z8+eJRxRktSyLiM3DRwxsLwOuG1g+V7AMcDHknwReBywY64Pn1TV9qqarKrJiYmJDkeWJLWky8hdDRyd5MgkhwGnAztmNlbVN6tqTVVtqKoNwFXAKVU11eFMkqQx0lnkqmovcDZwOXATcGlV7UpyXpJTunpeSZJmrO7y4FW1E9g5a925+9j3hC5nkSSNH694IklqlpGTJDXLyEmSmmXkJEnNMnKSpGYZOUlSs4ycJKlZRk6S1CwjJ0lqlpGTJDXLyEmSmmXkJEnNMnKSpGYZOUlSs4ycJKlZRk6S1CwjJ0lqlpGTJDXLyEmSmrV61ANIWh6+dN7DRz2Cxsj6cz93UJ7HMzlJUrOMnCSpWUZOktQsIydJapaRkyQ1y8hJkppl5CRJzTJykqRmGTlJUrOMnCSpWZ1GLsmmJDcn2Z3knDm2vzjJjUmuT/LhJA/uch5J0njpLHJJVgHbgJOAjcCWJBtn7fZZYLKqjgUuA17X1TySpPHT5Znc8cDuqrqlqu4ELgE2D+5QVR+tqu/2F68C1nU4jyRpzHQZubXArQPL0/11+3IW8P4O55EkjZkub7WTOdbVnDsmzwAmgSftY/tWYCvA+vXrl2o+SVLjujyTmwaOGFheB9w2e6ckTwH+CDilqr4/14GqantVTVbV5MTERCfDSpLa02XkrgaOTnJkksOA04EdgzskOQ74K3qBu6PDWSRJY6izyFXVXuBs4HLgJuDSqtqV5Lwkp/R3ez1wOPAPSa5NsmMfh5MkacG6fE+OqtoJ7Jy17tyB35/S5fNLksabVzyRJDXLyEmSmmXkJEnNMnKSpGYZOUlSs4ycJKlZRk6S1CwjJ0lqlpGTJDXLyEmSmmXkJEnNMnKSpGYZOUlSs4ycJKlZRk6S1CwjJ0lqlpGTJDXLyEmSmmXkJEnNMnKSpGYZOUlSs4ycJKlZRk6S1CwjJ0lqlpGTJDXLyEmSmmXkJEnNMnKSpGYZOUlSszqNXJJNSW5OsjvJOXNsv1uSv+9v/3SSDV3OI0kaL51FLskqYBtwErAR2JJk46zdzgK+XlU/B7wJeG1X80iSxk+XZ3LHA7ur6paquhO4BNg8a5/NwDv6v18G/EqSdDiTJGmMdBm5tcCtA8vT/XVz7lNVe4FvAvfvcCZJ0hhZ3eGx5zojq0XsQ5KtwNb+4neS3HyAs+nArQG+MuohVpq84dmjHkFLz38Li/HKJX/R7sFzrewyctPAEQPL64Db9rHPdJLVwH2Ar80+UFVtB7Z3NKcWIclUVU2Oeg5p1Py3sLx1+XLl1cDRSY5MchhwOrBj1j47gJn/tT0N+EhV3eVMTpKkxejsTK6q9iY5G7gcWAVcUFW7kpwHTFXVDuBvgHcm2U3vDO70ruaRJI2feOKkxUiytf8ysjTW/LewvBk5SVKzvKyXJKlZRk4Lsr9LtUnjIskFSe5IcsOoZ9G+GTkNbchLtUnj4kJg06iH0PyMnBZimEu1SWOhqq5gju/1ankxclqIYS7VJknLhpHTQgx1GTZJWi6MnBZimEu1SdKyYeS0EMNcqk2Slg0jp6H1b4c0c6m2m4BLq2rXaKeSRiPJxcCngIcmmU5y1qhn0l15xRNJUrM8k5MkNcvISZKaZeQkSc0ycpKkZhk5SVKzjJwkqVlGTlqAJN/Zz/YNC731SpILk5w2z/a3LdXdHpKcOnisJGcmedAij3VCkvctxVxSV4yctMxV1fOq6sYlOtyp9G6TNONMYFGRk1YCIyctQpLDk3w4yWeSfC7J4C2HVid5R5Lrk1yW5J79xzw6yceTXJPk8iQPHPK5PpZkMsmq/lnfDf3n/IN5HvP8JFcnuS7Ju5
PcM8njgVOA1ye5NsnLgEngXf3leyQ5t/+4G5JsT5L+8X4uyb/2j/eZJEfNer7HJPlskocs8D+l1CkjJy3O94CnVtWjgBOBN84EAXgosL2qjgW+BfxekkOBtwCnVdWjgQuA1yzwOR8JrK2qY6rq4cDb59n3H6vqMVX1CHqXYDurqq6kd63Rl1bVI6vqtcAUcEZ/+X+Bv+g/7hjgHsBv9o/3LmBb/3iPB26feaJ+PM8HNlfVLQv8m6ROrR71ANIKFeBPkzwR+D9699X72f62W6vqk/3f/xZ4EfAB4BjgQ/0WrmIgFEO6BXhIkrcA/wJ8cJ59j0nyJ8B9gcPpXW90GCcm+UPgnsBPA7uSfIxeXN8DUFXfA+j/HQ8DtgO/VlXekULLjpGTFucMYAJ4dFX9IMkXgbv3t82+IGzRi+KuqvqlxT5hVX09ySOAXwdeADwdeO4+dr8QOLWqrktyJnDC/o6f5O7AW4HJqro1yavo/U1z3Udwxu39fY7D2y5pGfLlSmlx7gPc0Q/cicCDB7atTzITsy3AvwE3AxMz65McmuQXF/KESdYAh1TVu4FXAI+aZ/d7Abf3XyY9Y2D9t/vb5lqeifRXkhwOnAZQVd8CppOc2p/jbjPvMwLfAH6D3lntCQv5e6SDwchJi/MuYDLJFL2I/MfAtpuAZye5nt5Lfn9ZVXfSi8Zrk1wHXEvvva2FWAt8LMm19M7UXj7Pvq8APg18aNZslwAv7X9I5Kj+cc7vH/P7wF8DnwPeS+/+gTOeCbyo/zddCTxgZkNVfRn4LWBbkscu8G+SOuWtdiRJzfJMTpLULD94Ii0TSd4DHDlr9cuqap+fjEyyDXjCrNVvrqr5vl4gjQ1frpQkNcuXKyVJzTJykqRmGTlJUrOMnCSpWUZOktSs/wfAG4XuidOsSwAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 504x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"f, ax = plt.subplots(figsize=(7, 5))\n",
"sns.countplot(x='label_is_attack', data=df, order = df['label_is_attack'].value_counts().index, ax=ax)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label_is_attack</th>\n",
" <th>percentage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13484708</td>\n",
" <td>0.8307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2748235</td>\n",
" <td>0.1693</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" label_is_attack percentage\n",
"0 13484708 0.8307\n",
"1 2748235 0.1693"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_is_attack = df.groupby('label_is_attack').agg({'label_is_attack':'first', 'label_is_attack':'count'}).rename_axis(None).sort_values('label_is_attack', ascending=False)\n",
"df_is_attack['percentage'] = df_is_attack / df_is_attack.loc[:'label_is_attack'].sum()\n",
"df_is_attack"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Number of flows per attack type\n",
"\n",
"The following graph shows the number of flows accounting for the different attack types. \n",
"The graph reveals an under-representation of attack types in the following categories compared with the other attack types:\n",
"* DoS attacks-GoldenEye\n",
"* DoS attacks-Slowloris\n",
"* DDOS attack-LOIC-UDP\n",
"* Brute Force-Web\n",
"* Brute Force-XSS\n",
"* SQL Injection \t"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x1abc7d3eb8>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABQQAAAGtCAYAAABAwnmuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdebidZXnv8e8PAk0wEIYAgoKRWQGNEgc0KFREpFil0OIpWJFqtIoRKFiPI844HDkiKkYLKOJwFLSIRakIAorVTCQBiShD61DDLJMI8T5/rGfrcrP3zt5J9l6B9f1cV66s9xnv903+uq/7ed9UFZIkSZIkSZL6w3q9DkCSJEmSJEnSxDEhKEmSJEmSJPURE4KSJEmSJElSHzEhKEmSJEmSJPURE4KSJEmSJElSH5nU6wCk6dOn14wZM3odhiRJkiRJ0iPGggULbqmqLYfqMyGonpsxYwbz58/vdRiSJEmSJEmPGEluGq7PI8OSJEmSJElSH7FCUD334M23cfMnP9/rMCRJkiRJktjyn47sdQjjzgpBSZIkSZIkqY+YEJQkSZIkSZL6iAlBSZIkSZIkqY+YEJQkSZIkSZL6iAlBSZIkSZIkqY+YEJQkSZIkSZL6iAnBR7gkK5MsTnJVkoVJnrUGa70ryf5rMz5JkiRJkiRNrEm9DkDj7r6qmgmQ5AXA+4Hnrs5CVfX2tRmYJEmSJEmSJp4Vgv1lE+D2gYskJyb5cZIlSd7Z2mYk+UmSTye5OslFSaa0vrOSHNZ+H5Tk2iRXJDk1yQWt/aQkZyS5NMn1Seb24D4lSZIkSZI0DBOCj3xT2pHha4HPAO8GSHIAsDPwdGAmsFeS57Q5OwMfr6rdgTuAQ7sXTDIZ+BTwwqqaDWw5aM/dgBe0td+RZIPBQSWZk2R+kvm33v3btXSrkiRJkiRJWhUTgo9891XVzKraDTgQ+FySAAe0P4uAhXSSeDu3OTdU1eL2ewEwY9CauwHXV9UN7fqLg/q/WVX3V9UtwApg68FBVdW8qppVVbO2mLrJmt2hJEmSJEmSRs13CPaRqroyyXQ6FX0B3l9Vn+oek2QGcH9X00pgyqClsoqtBs/3/5kkSZIkSdI6wgrBPpJkN2B94Fbg28DRSaa2vsck2WqUS10L7NCShwCHr+VQJUmSJEmSNE6s3Hrkm5Jk4PhvgJdX1UrgoiRPAK7snCDmbuBIOhV9I6qq+5K8FvhWkluAH41P6JIkSZIkSVrbTAg+wlXV+iP0fRT46BBde3SN+XDX76O6xlxSVbu19xF+HJjfxpw0aI89kCRJkiRJ0jrDI8NaXa9qlYdXA9PofHVYkiRJkiRJ6zgrBLVaquoU4JRexyFJkiRJkqSxsUJQkiRJkiRJ6iMmBCVJkiRJkqQ+4pFh9dykLTdny386stdhSJIkSZIk9QUrBCVJkiRJkqQ+YkJQkiRJkiRJ6iMmBCVJkiRJkqQ+YkJQkiRJkiRJ6iN+VEQ998CK/+ZXHz++12FIkiRJ0oi2fd1Heh2CJK0VVghKkiRJkiRJfcSEoCRJkiRJktRHTAhKkiRJkiRJfcSEoCRJkiRJktRHTAhKkiRJkiRJfcSEoCRJkiRJktRH+johmGRlksVJrk5yVZLjk6zX+vZNcmeSRUmWJ7ksycGD5s9Jcm3786Mks7v6Dm5zr0pyTZJXjyGumUkO6rreN8mz1uA+7x7FmJOSnDCo7cYk09vvxyb5tyTXJfl5ko8m2bArvgu65r0wyfwkP2nP5sOrG7skSZIkSZLWrr5OCAL3VdXMqtodeD5wEPCOrv7Lq+opVbUrMBc4LcnzoJPwA14NzK6q3YDXAF9I8ugkGwDzgBdV1ZOBpwCXjiGumS2WAfsCq50QXFNJApwHfL2qdgZ2AaYC7x1i7B7AacCRVfUEYA/g+gkMV5IkSZIkSSPo94TgH1XVCmAOcExLgA3uXwy8CzimNf0LcGJV3dL6Fw
KfBV4HbAxMAm5tffdX1fLBayZ5epIftErCHyTZtVXdvQs4vFUv/gudZONx7XqfJC9K8p9t3neSbN3Wm5rkzCRLkyxJcuig/aYnuTLJX43x8fwl8LuqOrPdz0rgOODoJBsNGvtG4L1VdW0b+2BVfWKM+0mSJEmSJGmcTOp1AOuSqrq+HRneapghC4ET2+/dgQWD+ucDL6+q25KcD9yU5GLgAuCLVfWHQeOvBZ5TVQ8m2R94X1UdmuTtwKyqOgYgyRTg7qr6cLveDHhmVVWSV9JJwv0z8Dbgzqras2sc7ffWwPnAW6vqP4a5v+OSHNl1ve1w91pVv03yX8BOg9bYA/g/w6wvSZIkSZKkHjMh+FAPqQ4cZd9AfwFU1SuT7AnsD5xA50jyUYPGTwM+m2TnNm+DUcb4WODLSbYBNgRuaO37Ay8dGFRVt7efGwAXA6+rqu+NsO4pA0lH6LxDcPB9DTJc+yolmUOnIpPHbLbx6iwhSZIkSZKk1eCR4S5JdgBWAiuGGfIU4Cft9zXAXoP6n9raAaiqpVV1Cp1k4KE81LuBS6pqD+BFwORRhvox4LRWCfjqrnnDJegepFPh94KBhiTvbUeQF49iv6uBWd0NSTYBtgN+PsTYwc/lIapqXlXNqqpZW0ydMooQJEmSJEmStDaYEGySbAmcTifR9pCkWpIn0TmS+/HW9EHgA0m2aP0z6VQAfqK9y2/frukzgZuG2HYa8Mv2+6iu9rvovIdwuOvueS/var+IP73jsPvIcAFHA7sleRNAVb2lfVBl5hBxDXYxsFGSf2jrrk/nWPBZVXXvoLEfAt6cZJc2dr0kx49iD0mSJEmSJE2Afk8ITmlVclcD36GTUHtnV/8+7cMdy+kkAudW1cUAVXU+cAbwgyTXAp+m82XdX9Op1HtjkuWtAu+dPPS4MHSSiu9P8n1g/a72S4AnttgOB74BHDLwURHgJOArSS4Hbuma9x5gsyTLklwF7DfQ0T4E8lJgvySvHctDagnSQ4C/TXId8FPgd8Cbhxi7BDgW+GKSnwDLgG3Gsp8kSZIkSZLGT4YohpMm1JO337ou/Jcjeh2GJEmSJI1o29d9pNchSNKoJVlQVbOG6uv3CkFJkiRJkiSpr5gQlCRJkiRJkvqICUFJkiRJkiSpj5gQlCRJkiRJkvqICUFJkiRJkiSpj0zqdQDSBltt59e6JEmSJEmSJogVgpIkSZIkSVIfMSEoSZIkSZIk9RETgpIkSZIkSVIfMSEoSZIkSZIk9RE/KqKeu+fmn3HlvIN7HYYkSZKkPrD3nAt6HYIk9ZwVgpIkSZIkSVIfMSEoSZIkSZIk9RETgpIkSZIkSVIfMSEoSZIkSZIk9RETgpIkSZIkSVIfMSEoSZIkSZIk9ZGeJwSTrEyyOMnVSa5KcnyS9VrfvknuTLIoyfIklyU5eBRr7prk0rbuT5LMG0M8myZ5bdf1jCR/v3p3By2OWas59+5h2uckubb9+VGS2UPtl2Rqkk8l+Xl7vpclecYQ6x2V5LSh4k7yn+05/leSm9vvxUl+M0z7jCQ3Jlna/j0vSvLo1bl/SZIkSZIkrX2Teh0AcF9VzQRIshXwBWAa8I7Wf3lVHdz6ZwJfT3JfVV08wpqnAqdU1b+1eXuOIZ5NgdcCn2jXM4C/b3H1XEuIvhqYXVW3JHkqnWfy9Kr6n0HDPwPcAOxcVX9IsgPwhLHsV1XPaPseBcyqqmMGxfOQ9iQA+7X43ge8GZg7ln0lSZIkSZI0PnpeIditqlYAc4Bj0rJKg/oXA+8CjgFI8rgkFydZ0v7evg3dBvhF17ylg9dq1XMXJ1nYqtle3LpOBnZs1W4fatf7tOvjWgXc5W3ewiTP6lrzjV2VcScP2m+9JJ9N8p4k6yc5K8myNv64MTymfwFOrKpb2r0tBD4LvG7QfjsCzwDeWlV/aGOvr6pvjmGvteEyYKcJ3lOSJEmSJEnDWBcqBP9MVV3fjgxvNcyQhcCJ7fdpwOeq6rNJjqZTGf
gS4BTgu0l+AFwEnFlVdwxa53fAIVX12yTTgR8mOR94E7BHV9XivsAJXVWKGwHPr6rfJdkZ+CIwK8kL297PqKp7k2zetdck4BxgWVW9N8lewGOqao+25qZjeES7AwsGtc0HXj7EuMVVtXKU6x7effSYtZfEOxgYKiE7h07yl603n7KWtpIkSZIkSdKqrFMVgl0eUh04TN/e/Oko79nAbICqOpPO0divAPvSSfb9xRDrvC/JEuA7wGOArUcR2wbAp5Msbes/sbXvTyfxeG+L4bauOZ+iJQPb9fXADkk+luRA4Lej2HckAWoN1/hyVc0c+EMnybgmLkmyGNgEeP/gzqqaV1WzqmrWZlM3XMOtJEmSJEmSNFrrXEKwveduJbBimCFPAX4yTN8fk2JV9auqOqOqXgw8COwxaOwRwJbAXi0B9htg8ihCPK6NfTIwCxjIZo2UlPsBsF+SyS2229v8S+kc9f1Mku26PszxmhH2vwbYa1DbU1t7t6uBJ7dqyz+T5HVde207wl5rYr+WXPyHIaozJUmSJEmS1CPrVEIwyZbA6cBpVfWQ5FqSJwFvAz7emn4AvLT9PgK4oo07MMkG7fejgS2AXw5abhqwoqoeSLIf8LjWfhewcde4wdfTgF+39/K9DFi/tV8EHN2OFDPoyPC/Av8OfCXJpHZEeb2qOrfdz1Or6r+7KvROH+ExfRD4QJIt2j4zgaP400dQAKiqn9Op8nvnwPsYk+yc5MVV9fGuvX41wl6SJEmSJEl6hFkX3iE4pR0t3YBOJd/ZwEe6+vdJsgjYiE7V4NyuLwzPBc5IciJwM/CK1n4A8NEkv2vXJw7xBd5zgG8kmQ8sBq4FqKpbk3w/yTLgQjpfyH0wyVXAWXQSb+cm+VvgEuCeNu9bLTk3P8nv6SQA3zywWVV9JMm0dn8nA2d2Ve/972GezUZJftF1/ZG2zmOAHyQpOgnLI6vq10PMfyXwf4CfJbkXuJU/vX9RkiRJkiRJfShDFOJJE+oJj9u0znjL7FUPlCRJkqQ1tPecC3odgiRNiCQLqmrWUH3r1JFhSZIkSZIkSePLhKAkSZIkSZLUR0wISpIkSZIkSX3EhKAkSZIkSZLUR9aFrwyrzz1qy518sa8kSZIkSdIEsUJQkiRJkiRJ6iMmBCVJkiRJkqQ+YkJQkiRJkiRJ6iMmBCVJkiRJkqQ+4kdF1HO333IdXz3zwF6HIUmSJGkddtgrvtXrECTpEcMKQUmSJEmSJKmPmBCUJEmSJEmS+ogJQUmSJEmSJKmPmBCUJEmSJEmS+ogJQUmSJEmSJKmPmBCUJEmSJEmS+ogJwWEkWZlkcZKrk1yV5PgkIz6vJBslOSfJ0iTLklyRZOoY9jw2yUZd129eg/hPSnLCas69NMmsrusZSZaNYt6NSaaPdrwkSZIkSZImngnB4d1XVTOranfg+cBBwDtWMecNwG+qas+q2gP4R+CBMex5LLBR1/VqJwQlSZIkSZKkoZgQHIWqWgHMAY5Jx+QkZ7ZKwEVJ9mtDtwF+2TVveVXdP3i9JJ9MMr9VH76ztc0FtgUuSXJJkpOBKa1K8Zw25utJFrR5c7rWOzDJwlbJePEQ+70qyYVJpiSZm+SaJEuSfGmszyLJUUlO67q+IMm+I4zfoT2jp411L0mSJEmSJK19k3odwMNFVV3fjgxvBRzZ2vZMshtwUZJdgDPa78OAi4HPVtV1Qyz3lqq6Lcn6wMVJnlRVpyY5Htivqm4BSHJMVc3smnd0mzcF+HGSc+kkdT8NPKeqbkiyefdGSY4BDgBeUlX3J3kT8Pj2e9MRbvmcJPe13xsCfxjD4xrYe1fgS8ArqmrxoL45dJKsTN9i8liXliRJkiRJ0mqyQnBs0v6eDZwNUFXXAjcBu7Sk1w7Ah4DN6STtnjDEOn+XZCGwCNgdeOIo95+b5Crgh8B2wM7AM4HLquqGFs9tXeNfBrwQOLSrUnEJnWTfkcCDI+x1RDsyPZPOcemx2hL4N+DIwcnAFu
e8qppVVbM2mbrhaiwvSZIkSZKk1WFCcJSS7ACsBFbwp8TgQ1TV3VV1XlW9Fvg8g5JpSR4PnAA8r6qeBHwTWGWJXDuWuz+wd1U9mU4ycXKLpYaZtgyYATy2q+2vgI8DewELkkxK8u12NPkzq4qDThKx+//NcLHfCfw38OxRrClJkiRJkqQJYkJwFJJsCZwOnFZVBVwGHNH6dgG2B5YneXaSzVr7hnQq/24atNwmwD3AnUm2plPBN+AuYOOu6weSbNB+TwNur6p72zHlZ7b2K4HntkQjg44MLwJeDZyfZNt25Hm7qroEeCOwKTC1ql7QqgFfOYrHcSMwM8l6SbYDnj7MuN8DLwH+Icnfj2JdSZIkSZIkTQDfITi8KUkWAxvQqYo7G/hI6/sEcHqSpa3vqPZOvh2BTyYJnWTrN4FzuxetqquSLAKuBq4Hvt/VPQ+4MMmvq2q/dr2kHS8+GnhNkiXAcjrHhqmqm9v7+M5rCb8VdL6KPLDfFUlOaLEcAHw+yTQ6lYWnVNUdY3wu3wduAJbSqUBcONzAqronycHAfyS5p6r+bYx7SZIkSZIkaS1Lp+BN6p0dZ0yrD7xj716HIUmSJGkddtgrvtXrECTpYSXJgqqaNVSfR4YlSZIkSZKkPmJCUJIkSZIkSeojJgQlSZIkSZKkPmJCUJIkSZIkSeojfmVYPbfZ9J19QbAkSZIkSdIEsUJQkiRJkiRJ6iMmBCVJkiRJkqQ+YkJQkiRJkiRJ6iMmBCVJkiRJkqQ+4kdF1HMrbruOU895Qa/DkCRJkvra3CO+3esQJEkTxApBSZIkSZIkqY+YEJQkSZIkSZL6iAlBSZIkSZIkqY+YEJQkSZIkSZL6iAlBSZIkSZIkqY+YEJQkSZIkSZL6iAlBjVqSlUkWJ7kqycIkz1rF+BlJ/n6i4pMkSZIkSdKqmRDUWNxXVTOr6snA/wbev4rxMwATgpIkSZIkSesQE4JaXZsAtwOk40NJliVZmuTwNuZkYJ9WVXhczyKVJEmSJEnSH03qdQB6WJmSZDEwGdgG+MvW/jfATODJwHTgx0kuA94EnFBVB/ciWEmSJEmSJD2UFYIai4Ejw7sBBwKfSxJgNvDFqlpZVb8Bvgc8baSFksxJMj/J/Lt/+/vxj1ySJEmSJEmACUGtpqq6kk414JZAVmP+vKqaVVWzpm6y4VqPT5IkSZIkSUMzIajVkmQ3YH3gVuAy4PAk6yfZEngO8CPgLmDj3kUpSZIkSZKkwXyHoMZi4B2C0KkKfHlVrUzyNWBv4CqggDdW1f8kuRV4MMlVwFlVdUpvwpYkSZIkSdIAE4Iatapaf5j2Ak5sf7rbHwCeNwGhSZIkSZIkaZQ8MixJkiRJkiT1EROCkiRJkiRJUh8xIShJkiRJkiT1EROCkiRJkiRJUh8xIShJkiRJkiT1Eb8yrJ7bavOdmXvEt3sdhiRJkiRJUl+wQlCSJEmSJEnqIyYEJUmSJEmSpD5iQlCSJEmSJEnqIyYEJUmSJEmSpD7iR0XUczfecR2v+NqBvQ5DkiRJ6pkzD/lWr0OQJPURKwQlSZIkSZKkPmJCUJIkSZIkSeojJgQlSZIkSZKkPmJCUJIkSZIkSeojJgQlSZIkSZKkPmJCUJIkSZIkSeojJgTXQJKVSRZ3/Xl11++7kyxvvz+XZN8kdyZZlOQnSd4xzJpnJbmhzbt2uHGriGtmkoNGMW4gpoGYvzPWvSRJkiRJkvTwMqnXATzM3VdVMwe1fQogyaXACVU1v13vC1xeVQcneRSwOMkFVbVgiHVPrKqvJpkMXJPkc1V1Q/eAJOtX1cph4poJzAL+fRT3cHlVHTyKcX8myaSqenCs8yRJkiRJktRbVgj2QFXdAywAdlzF0Mnt73sAktyY5O1JrgD+NsmlSWa1vumtf0PgXcDhrerv8CSPSnJGkh+3CsUXj7RpkscluTjJkvb39q39rCQfSXIJ8IEkU5OcmWRpG3toG3dAkiuTLEzylSRTV/dZSZIkSZ
Ikae0yIbhmpnQdt/3aaCcl2QJ4JnD1MEM+lGQx8AvgS1W1oqvvd1U1u6q+NNTEqvo98Hbgy1U1s6q+DLwF+G5VPQ3Yr63/qDZln657eEtrOw34XFU9CTgHOLVri12A/avqn4G3AXdW1Z5t7HeTTAfe2sY8FZgPHD/EM5iTZH6S+b/77e9HelySJEmSJElaizwyvGaGOjI8kn2SLAL+AJxcVcMlBAeODE8FLk7yrKr6Qev78mrEeQDw10lOaNeTge3b76GODO8N/E37fTbwwa6+r3QdVd4feOlAR1XdnuRg4InA95MAbAhcOTigqpoHzAOYvtO0Wo17kiRJkiRJ0mowITixHpJ8S3Im8BTgV1X1Zx8Cqaq727sIZwMDCcF7uoY8yJ+qPCczvACHVtXyQXtvPcq4uxN23ftnUN9A239U1f8a5dqSJEmSJEmaQB4Z7rGqekU72vuQrwInmQQ8A/j5MNNvBPZqvw/rar8L2Ljr+tvA69NK9pI8ZRVh/YA/Vf4dAVwxzLiLgGO64t0M+CHw7CQ7tbaNkuyyiv0kSZIkSZI0QUwIrpsG3iG4BFgKnDfMuA8D/5TkB8D0rvZLgCcOfFQEeDewAbAkybJ2PZK5wCuSLAFeBrxhmHHvATZLsizJVcB+VXUzcBTwxTb/h8Buq9hPkiRJkiRJEyRVvr5NvTV9p2n1og/t3eswJEmSpJ4585Bv9ToESdIjTJIFVTVrqD4rBCVJkiRJkqQ+YkJQkiRJkiRJ6iMmBCVJkiRJkqQ+YkJQkiRJkiRJ6iOTeh2ANGPTnX2JsiRJkiRJ0gSxQlCSJEmSJEnqIyYEJUmSJEmSpD5iQlCSJEmSJEnqIyYEJUmSJEmSpD7iR0XUc9fd8WsO+tp7eh2GJEmSNKH+/ZC39joESVKfskJQkiRJkiRJ6iMmBCVJkiRJkqQ+YkJQkiRJkiRJ6iMmBCVJkiRJkqQ+YkJQkiRJkiRJ6iMmBCVJkiRJkqQ+YkJwDSV5S5KrkyxJsjjJM5IcnGRRkquSXJPk1W3sSUlOGDT/xiTTh1j3qCQ3tzWvTvLVJBuNMbZNk7x2lGPnJvlJknPGsockSZIkSZIeXkwIroEkewMHA0+tqicB+wP/A8wDXlRVTwaeAly6mlt8uapmVtXuwO+Bw4eIYdII8zcFRpUQbOMOqqojRjN4FftKkiRJkiRpHWVCcM1sA9xSVfcDVNUtwF3AJODW1nZ/VS1fk01a8u1RwO3t+qwkH0lyCfCBwZWHSZYlmQGcDOzYqgw/1PpOTPLjVtH4ztZ2OrADcH6S45JsnuTr
gitextract_m34ow7k6/ ├── .dockerignore ├── .gitattributes ├── .github/ │ └── workflows/ │ ├── build.yml │ ├── deployment.yml │ └── train.yml ├── .gitignore ├── .idea/ │ ├── .gitignore │ ├── deployment.xml │ ├── inspectionProfiles/ │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── ml-ids.iml │ ├── modules.xml │ └── vcs.xml ├── .pylintrc ├── Makefile ├── README.md ├── data/ │ ├── README.md │ └── Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv ├── environment-notebook.yaml ├── environment.yaml ├── ml_ids/ │ ├── __init__.py │ ├── conf.py │ ├── data/ │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── metadata.py │ │ └── split_dataset.py │ ├── keras/ │ │ ├── __init__.py │ │ ├── callbacks.py │ │ ├── evaluation.py │ │ ├── metrics.py │ │ ├── model_selection.py │ │ └── prediction.py │ ├── libs/ │ │ └── dfencoder/ │ │ └── dataframe.py │ ├── model_selection.py │ ├── models/ │ │ ├── __init__.py │ │ └── gradient_boost/ │ │ ├── __init__.py │ │ ├── mlflow_wrapper.py │ │ └── train.py │ ├── prediction.py │ ├── tf_utils.py │ ├── transform/ │ │ ├── __init__.py │ │ ├── preprocessing.py │ │ └── sampling.py │ └── visualization.py ├── models/ │ └── gradient_boost/ │ ├── envs/ │ │ ├── local/ │ │ │ └── train.py │ │ └── sagemaker/ │ │ ├── configs/ │ │ │ ├── deploy.json │ │ │ ├── train-cpu.json │ │ │ └── train-gpu.json │ │ ├── container/ │ │ │ ├── Dockerfile │ │ │ └── train.py │ │ └── scripts/ │ │ ├── build_image.sh │ │ ├── deploy.py │ │ ├── push_image_to_ecr.sh │ │ ├── train.py │ │ └── undeploy.py │ ├── project/ │ │ ├── MLproject │ │ ├── conda.yaml │ │ └── train.py │ ├── training_params.json │ └── training_params_quick_run.json ├── notebooks/ │ ├── 01_data-cleanup/ │ │ └── data_cleanup.ipynb │ ├── 02_exploratory-data-analysis/ │ │ └── exploratory_data_analysis.ipynb │ ├── 03_ml-prototype/ │ │ ├── ml-prototype.ipynb │ │ └── models/ │ │ └── gradient_boost_model.cbm │ ├── 04_ml-prototype-spark/ │ │ ├── ml-prototype-spark.ipynb │ │ └── models/ │ │ ├── gb-model/ │ │ │ ├── bestModel/ │ │ │ │ ├── 
data/ │ │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ │ ├── .part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── .part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ │ ├── _SUCCESS │ │ │ │ │ ├── part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── 
part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ ├── part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ │ └── part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── metadata/ │ │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ │ ├── .part-00000.crc │ │ │ │ │ ├── _SUCCESS │ │ │ │ │ └── part-00000 │ │ │ │ └── treesMetadata/ │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── 
.part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── 
part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ └── part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ ├── estimator/ │ │ │ │ └── metadata/ │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ │ ├── evaluator/ │ │ │ │ └── metadata/ │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ │ └── metadata/ │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── pipeline-model/ │ │ ├── metadata/ │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── stages/ │ │ ├── 0_ValueCleaner_57f061a9e393/ │ │ │ └── metadata/ │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 1_Imputer_3f8cf4b571a8/ │ │ │ ├── data/ │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── 
.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet │ │ │ └── metadata/ │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 2_OneHotEncoderEstimator_f1dc6e50f52e/ │ │ │ ├── data/ │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet │ │ │ └── metadata/ │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 3_VectorAssembler_ef6b7bf933ee/ │ │ │ └── metadata/ │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── 4_BinaryLabelMaker_3b174e5e0c29/ │ │ └── metadata/ │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ ├── 05_anomaly_detection/ │ │ ├── dl-anomaly-detection.ipynb │ │ ├── models/ │ │ │ ├── denoising_autoencoder_model.h5 │ │ │ ├── simple_autoencoder_model.h5 │ │ │ └── stacked_autoencoder_model.h5 │ │ └── notebook_utils.py │ ├── 06_dl_classifier/ │ │ ├── dl-classifier.ipynb │ │ ├── models/ │ │ │ ├── c0cb0656-558f-4311-b138-9b91ab4d1fe6.h5 │ │ │ ├── model_class_weight.h5 │ │ │ ├── model_no_class_weights.h5 │ │ │ └── opt_model.h5 │ │ └── notebook_utils.py │ └── 07_binary_classifier_comparison/ │ ├── binary-classifier-comparison.ipynb │ ├── models/ │ │ └── gb_835066e8-2427-48ca-a521-67195008cb91.catboost │ └── notebook_utils.py ├── setup.cfg ├── setup.py ├── tests/ │ ├── data/ │ │ └── test_dataset.py │ ├── transform/ │ │ └── test_preprocessing.py │ └── validation_data/ │ └── validation.csv └── upload.py
SYMBOL INDEX (105 symbols across 27 files)
FILE: ml_ids/data/dataset.py
function remove_inf_values (line 12) | def remove_inf_values(df: pd.DataFrame) -> pd.DataFrame:
function remove_negative_values (line 25) | def remove_negative_values(df: pd.DataFrame, ignore_cols: List[str] = No...
function add_label_category_column (line 45) | def add_label_category_column(df: pd.DataFrame) -> pd.DataFrame:
function add_label_is_attack_columns (line 56) | def add_label_is_attack_columns(df: pd.DataFrame) -> pd.DataFrame:
function load_dataset_generic (line 68) | def load_dataset_generic(load_df_fn,
function load_dataset (line 107) | def load_dataset(dataset_path: str,
function load_dataset_hdf (line 139) | def load_dataset_hdf(dataset_path: str,
FILE: ml_ids/data/split_dataset.py
function split_dataset (line 34) | def split_dataset(dataset_path, output_path, val_size, test_size, nrows,...
function remove_extra_labels (line 58) | def remove_extra_labels(dataset: pd.DataFrame):
function save_dataset (line 67) | def save_dataset(dataset: pd.DataFrame, path: str, ds_type: str):
FILE: ml_ids/keras/callbacks.py
class OneCycleScheduler (line 11) | class OneCycleScheduler(callbacks.Callback):
method __init__ (line 16) | def __init__(self, iterations, max_rate, start_rate=None,
method _interpolate (line 26) | def _interpolate(self, iter1, iter2, rate1, rate2):
method on_batch_begin (line 30) | def on_batch_begin(self, batch, logs):
FILE: ml_ids/keras/evaluation.py
function evaluate_model (line 7) | def evaluate_model(model, X_train, y_train, X_val, y_val, metric_title):
FILE: ml_ids/keras/metrics.py
class AveragePrecisionScoreMetric (line 14) | class AveragePrecisionScoreMetric(callbacks.Callback):
method __init__ (line 19) | def __init__(self, X_val, y_val, batch_size=4096):
method get_precision_score (line 25) | def get_precision_score(self):
method on_epoch_end (line 36) | def on_epoch_end(self, epoch, logs):
FILE: ml_ids/keras/model_selection.py
function cross_val_train (line 11) | def cross_val_train(fit_fn,
FILE: ml_ids/keras/prediction.py
function predict (line 7) | def predict(model, X, decision_boundary=0.5):
function predict_proba (line 22) | def predict_proba(model, X):
FILE: ml_ids/libs/dfencoder/dataframe.py
class EncoderDataFrame (line 36) | class EncoderDataFrame(pd.DataFrame):
method __init__ (line 37) | def __init__(self, *args, **kwargs):
method swap (line 40) | def swap(self, likelihood=.15):
FILE: ml_ids/model_selection.py
function train_val_test_split (line 11) | def train_val_test_split(df: pd.DataFrame,
function split_x_y (line 46) | def split_x_y(df: pd.DataFrame, y_cols: List[str] = None) -> Tuple[pd.Da...
function best_precision_for_target_recall (line 60) | def best_precision_for_target_recall(y_true, y_pred_score, target_recall):
FILE: ml_ids/models/gradient_boost/mlflow_wrapper.py
class CatBoostWrapper (line 10) | class CatBoostWrapper(mlflow.pyfunc.PythonModel):
method load_context (line 15) | def load_context(self, context):
method preprocess (line 28) | def preprocess(self, data):
method predict (line 40) | def predict(self, context, model_input):
FILE: ml_ids/models/gradient_boost/train.py
function fit_pipeline (line 21) | def fit_pipeline(train_dataset):
function preprocess_val_dataset (line 41) | def preprocess_val_dataset(pipeline, val_dataset):
function preprocess_train_dataset (line 55) | def preprocess_train_dataset(pipeline, train_dataset, nr_attack_samples,...
function calculate_class_weights (line 76) | def calculate_class_weights(y_train):
function train_gb_classifier (line 87) | def train_gb_classifier(train_pool,
function train_model (line 127) | def train_model(train_dataset: pd.DataFrame,
FILE: ml_ids/prediction.py
function predict_proba_positive (line 6) | def predict_proba_positive(clf, X):
function predict_decision_boundary (line 18) | def predict_decision_boundary(clf, X, decision_boundary=0.5):
FILE: ml_ids/tf_utils.py
function enable_gpu_memory_growth (line 7) | def enable_gpu_memory_growth():
FILE: ml_ids/transform/preprocessing.py
function remove_outliers (line 15) | def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame:
function create_pipeline (line 27) | def create_pipeline(df: pd.DataFrame,
FILE: ml_ids/transform/sampling.py
function upsample_minority_classes (line 10) | def upsample_minority_classes(X: np.ndarray,
function create_sample_dict (line 45) | def create_sample_dict(df: pd.DataFrame,
function downsample (line 69) | def downsample(df: pd.DataFrame,
FILE: ml_ids/visualization.py
function plot_hist (line 14) | def plot_hist(hist,
function plot_confusion_matrix (line 49) | def plot_confusion_matrix(y_true,
function identity (line 120) | def identity(x):
function plot_threshold (line 127) | def plot_threshold(pred_train, pred_val, threshold, size=(15, 5), transf...
function get_misclassifications (line 147) | def get_misclassifications(y, y_true, pred):
function print_binary_performance (line 165) | def print_binary_performance(y, y_true, pred, print_misclassifications=T...
function plot_pr_curve (line 195) | def plot_pr_curve(y_true, y_score, size=(8, 5), average='weighted'):
function plot_pr_curves (line 219) | def plot_pr_curves(y_true, y_score_dict, size=(8, 5), average='weighted'):
function plot_pr_threshold_curves (line 244) | def plot_pr_threshold_curves(y_true, y_pred_score, size=(20, 8)):
FILE: models/gradient_boost/envs/local/train.py
function merge (line 8) | def merge(dict1, dict2):
function train (line 31) | def train(train_path, val_path, test_path, output_path, param_path):
FILE: models/gradient_boost/envs/sagemaker/container/train.py
function merge (line 25) | def merge(dict1, dict2):
FILE: models/gradient_boost/envs/sagemaker/scripts/deploy.py
function unpack (line 13) | def unpack(file):
function deploy (line 34) | def deploy(config_path, job_id):
FILE: models/gradient_boost/envs/sagemaker/scripts/train.py
function create_performance_metric_regex (line 10) | def create_performance_metric_regex(id):
function create_metric_def (line 20) | def create_metric_def(name, regex):
function get_metric_definitions (line 30) | def get_metric_definitions():
function train (line 55) | def train(config_path, param_path, image_name, mode, job_id):
FILE: models/gradient_boost/envs/sagemaker/scripts/undeploy.py
function undeploy (line 9) | def undeploy(config_path):
FILE: models/gradient_boost/project/train.py
function load_dataset (line 22) | def load_dataset(path):
function load_train_val_test_dataset (line 33) | def load_train_val_test_dataset(train_path, val_path, test_path):
function measure_performance (line 44) | def measure_performance(clf, pipeline, dataset):
function save_artifacts (line 68) | def save_artifacts(cbm_model_path, classifier, pipeline_path, pipeline, ...
function train (line 109) | def train(train_path,
FILE: notebooks/05_anomaly_detection/notebook_utils.py
function predict (line 10) | def predict(model, X, y):
function evaluate_pr_roc (line 17) | def evaluate_pr_roc(pred):
function plot_evaluation_curves (line 23) | def plot_evaluation_curves(pred):
function plot_pr_threshold_curves (line 45) | def plot_pr_threshold_curves(pred, pr_plot_lim=[0, 1]):
function best_precision_for_target_recall (line 68) | def best_precision_for_target_recall(pred, target_recall):
function get_misclassifications (line 73) | def get_misclassifications(y, pred_binary):
function print_performance (line 83) | def print_performance(y, pred, threshold):
function filter_benign (line 101) | def filter_benign(X, y):
FILE: notebooks/06_dl_classifier/notebook_utils.py
function transform_data (line 9) | def transform_data(dataset,
FILE: notebooks/07_binary_classifier_comparison/notebook_utils.py
function get_best_model_path (line 9) | def get_best_model_path(trials, model_path_var='model_path'):
function print_trial_results (line 13) | def print_trial_results(trials, best_run, model_path_var='model_path'):
function transform_data (line 23) | def transform_data(dataset,
FILE: tests/data/test_dataset.py
function val_data (line 10) | def val_data():
function inf_value_count (line 15) | def inf_value_count(df):
function neg_value_count (line 19) | def neg_value_count(df):
function nan_value_count (line 25) | def nan_value_count(df):
function negative_value_columns (line 29) | def negative_value_columns(df):
function test_loaded_dataset_must_not_contain_inf_values (line 34) | def test_loaded_dataset_must_not_contain_inf_values():
function test_loaded_dataset_must_not_contain_negative_values (line 40) | def test_loaded_dataset_must_not_contain_negative_values():
function test_loaded_dataset_must_not_contain_negative_values_except_excluded_cols (line 46) | def test_loaded_dataset_must_not_contain_negative_values_except_excluded...
function test_loaded_dataset_must_contain_label_category (line 53) | def test_loaded_dataset_must_contain_label_category():
function test_loaded_dataset_must_contain_label_is_attack (line 59) | def test_loaded_dataset_must_contain_label_is_attack():
function test_loaded_dataset_must_replace_invalid_value_with_nan (line 70) | def test_loaded_dataset_must_replace_invalid_value_with_nan(val_data):
function test_loaded_dataset_must_contain_only_specified_columns (line 79) | def test_loaded_dataset_must_contain_only_specified_columns():
function test_loaded_dataset_must_omit_specified_columns (line 85) | def test_loaded_dataset_must_omit_specified_columns():
FILE: tests/transform/test_preprocessing.py
function feature_df (line 13) | def feature_df():
function nan_value_count (line 19) | def nan_value_count(x):
function test_pipeline_must_impute_all_missing_values (line 23) | def test_pipeline_must_impute_all_missing_values(feature_df):
function test_pipeline_must_impute_selected_columns_only (line 34) | def test_pipeline_must_impute_selected_columns_only(feature_df):
function test_pipeline_must_not_impute_values_if_imputer_strategy_none (line 49) | def test_pipeline_must_not_impute_values_if_imputer_strategy_none(featur...
function test_pipeline_must_reorder_columns (line 61) | def test_pipeline_must_reorder_columns(feature_df):
function test_pipeline_must_impute_all_missing_values_with_mean (line 75) | def test_pipeline_must_impute_all_missing_values_with_mean(feature_df):
function test_pipeline_must_impute_all_missing_values_with_median (line 89) | def test_pipeline_must_impute_all_missing_values_with_median(feature_df):
function test_pipeline_must_scale_all_values (line 103) | def test_pipeline_must_scale_all_values(feature_df):
function test_pipeline_must_one_hot_encode_categorical_values (line 111) | def test_pipeline_must_one_hot_encode_categorical_values(feature_df):
Condensed preview — 219 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (6,788K chars).
[
{
"path": ".dockerignore",
"chars": 39,
"preview": "build/\ndata/\nnotebooks/\ntests/\ndataset/"
},
{
"path": ".gitattributes",
"chars": 239,
"preview": "*.csv filter=lfs diff=lfs merge=lfs -text\ntests/validation_data/*.csv -filter=lfs -diff=lfs -merge=lfs -text\n*.catboost "
},
{
"path": ".github/workflows/build.yml",
"chars": 786,
"preview": "name: Build\n\non: [push]\n\njobs:\n build:\n runs-on: ubuntu-latest\n\n steps:\n - uses: actions/checkout@v1\n\n - na"
},
{
"path": ".github/workflows/deployment.yml",
"chars": 1368,
"preview": "name: Deploy Model on AWS Sagemaker\n\non:\n deployment\n\njobs:\n deploy:\n name: Deploy\n runs-on: ubuntu-latest\n\n "
},
{
"path": ".github/workflows/train.yml",
"chars": 2419,
"preview": "name: Train Model on AWS Sagemaker\n\non:\n push:\n tags:\n - 'm*'\n\njobs:\n train:\n name: Deploy\n runs-on: ubunt"
},
{
"path": ".gitignore",
"chars": 1759,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": ".idea/.gitignore",
"chars": 39,
"preview": "\n# Default ignored files\n/workspace.xml"
},
{
"path": ".idea/deployment.xml",
"chars": 514,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"PublishConfigData\" autoUpload=\"Always\" s"
},
{
"path": ".idea/inspectionProfiles/profiles_settings.xml",
"chars": 174,
"preview": "<component name=\"InspectionProjectProfileManager\">\n <settings>\n <option name=\"USE_PROJECT_PROFILE\" value=\"false\" />\n"
},
{
"path": ".idea/misc.xml",
"chars": 371,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"JavaScriptSettings\">\n <option name=\"l"
},
{
"path": ".idea/ml-ids.iml",
"chars": 771,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<module type=\"PYTHON_MODULE\" version=\"4\">\n <component name=\"NewModuleRootManager"
},
{
"path": ".idea/modules.xml",
"chars": 264,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"ProjectModuleManager\">\n <modules>\n "
},
{
"path": ".idea/vcs.xml",
"chars": 180,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"VcsDirectoryMappings\">\n <mapping dire"
},
{
"path": ".pylintrc",
"chars": 18011,
"preview": "[MASTER]\n\n# A comma-separated list of package or module names from where C extensions may\n# be loaded. Extensions are lo"
},
{
"path": "Makefile",
"chars": 1995,
"preview": "SAGEMAKER_TRAIN_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/train-gpu.json\nSAGEMAKER_DEPLOY_CONFIG_PATH=mod"
},
{
"path": "README.md",
"chars": 13376,
"preview": "# A machine learning based approach towards building an Intrusion Detection System\n\n## Problem Description\nWith the risi"
},
{
"path": "data/README.md",
"chars": 1973,
"preview": "## Data\n\nThe data used to train the classifiers is taken from the [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-"
},
{
"path": "data/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv",
"chars": 134,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:acff8bc61376ee031d80878ee6099e0b1a87a1bd711d8068298421418c9f8147\ns"
},
{
"path": "environment-notebook.yaml",
"chars": 951,
"preview": "name: ml-ids-notebooks\nchannels:\n - anaconda\n - conda-forge\n - defaults\ndependencies:\n - catboost=0.18.1=py37_0\n - "
},
{
"path": "environment.yaml",
"chars": 838,
"preview": "name: ml-ids\nchannels:\n - anaconda\n - conda-forge\n - defaults\ndependencies:\n - catboost=0.18.1=py37_0\n - click=7.0="
},
{
"path": "ml_ids/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ml_ids/conf.py",
"chars": 240,
"preview": "\"\"\"\nGlobal configuration variables.\n\"\"\"\nimport os\n\nROOT_DIR = os.sep.join(os.path.dirname(os.path.abspath(__file__)).spl"
},
{
"path": "ml_ids/data/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ml_ids/data/dataset.py",
"chars": 6738,
"preview": "\"\"\"\nUtilities to manipulate the CIC-IDS-2018 dataset.\n\"\"\"\nfrom typing import List\nimport os\nimport glob\nimport numpy as "
},
{
"path": "ml_ids/data/metadata.py",
"chars": 3489,
"preview": "\"\"\"\nMetadata of the CIC-IDS-2018 dataset.\n\"\"\"\nCOLUMN_DTYPES = {\n 'dst_port': 'uint32',\n 'protocol': 'uint8',\n '"
},
{
"path": "ml_ids/data/split_dataset.py",
"chars": 3040,
"preview": "\"\"\"\nCLI to split a single dataset into train/val/test sub-datasets.\n\"\"\"\nimport os\nimport sys\nimport logging\nimport click"
},
{
"path": "ml_ids/keras/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ml_ids/keras/callbacks.py",
"chars": 1710,
"preview": "\"\"\"\nCustom callbacks for Keras models.\n\"\"\"\n# pylint: disable=import-error\nfrom tensorflow import keras\nfrom tensorflow.k"
},
{
"path": "ml_ids/keras/evaluation.py",
"chars": 962,
"preview": "\"\"\"\nUtility functions to evaluate Keras models.\n\"\"\"\nPREDICT_BATCH_SIZE = 16384\n\n\ndef evaluate_model(model, X_train, y_tr"
},
{
"path": "ml_ids/keras/metrics.py",
"chars": 1438,
"preview": "\"\"\"\nUtilities to create custom metrics for Keras models.\n\"\"\"\n# pylint: disable=import-error\nimport gc\nimport numpy as np"
},
{
"path": "ml_ids/keras/model_selection.py",
"chars": 2550,
"preview": "\"\"\"\nUtility functions for model selection of Keras models.\n\"\"\"\nimport gc\nfrom typing import Tuple\nimport numpy as np\nfro"
},
{
"path": "ml_ids/keras/prediction.py",
"chars": 1258,
"preview": "\"\"\"\nUtility functions to create predictions using Keras models.\n\"\"\"\nPREDICT_BATCH_SIZE = 16384\n\n\ndef predict(model, X, d"
},
{
"path": "ml_ids/libs/dfencoder/dataframe.py",
"chars": 2863,
"preview": "# Copyright (c) 2019, Michael Klear.\n# All rights reserved.\n#\n# Redistribution and use in source and binary forms, with "
},
{
"path": "ml_ids/model_selection.py",
"chars": 2729,
"preview": "\"\"\"\nUtilities for machine learning model selection.\n\"\"\"\nfrom typing import Tuple, List\nimport numpy as np\nimport pandas "
},
{
"path": "ml_ids/models/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ml_ids/models/gradient_boost/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ml_ids/models/gradient_boost/mlflow_wrapper.py",
"chars": 1391,
"preview": "\"\"\"\nWrapper to enable usage of a CatBoost estimator with MLflow.\n\"\"\"\nimport pickle\nimport mlflow.pyfunc\nfrom catboost im"
},
{
"path": "ml_ids/models/gradient_boost/train.py",
"chars": 7106,
"preview": "\"\"\"\nUtilities to train a machine learning estimator based on the Gradient Boosting algorithm using the CatBoost library."
},
{
"path": "ml_ids/prediction.py",
"chars": 1282,
"preview": "\"\"\"\nUtilities to create predictions given a Scikit-learn estimator and a dataset containing input features.\n\"\"\"\n\n\ndef pr"
},
{
"path": "ml_ids/tf_utils.py",
"chars": 357,
"preview": "\"\"\"\nUtility functions for TensorFlow.\n\"\"\"\nimport tensorflow as tf\n\n\ndef enable_gpu_memory_growth():\n \"\"\"\n Enables "
},
{
"path": "ml_ids/transform/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "ml_ids/transform/preprocessing.py",
"chars": 3403,
"preview": "\"\"\"\nUtilities for data pre-processing.\n\"\"\"\nfrom typing import List\nimport numpy as np\nimport pandas as pd\nfrom sklearn.p"
},
{
"path": "ml_ids/transform/sampling.py",
"chars": 3236,
"preview": "\"\"\"\nUtilities to modify the amount of samples of specific categories in a datasets.\n\"\"\"\nimport numpy as np\nimport pandas"
},
{
"path": "ml_ids/visualization.py",
"chars": 8879,
"preview": "\"\"\"\nVisualization utilities for IPython Notebooks.\n\"\"\"\n# pylint: disable=import-error\nimport numpy as np\nimport pandas a"
},
{
"path": "models/gradient_boost/envs/local/train.py",
"chars": 1591,
"preview": "import json\nimport click\nimport mlflow\nimport shutil\nimport os\n\n\ndef merge(dict1, dict2):\n \"\"\"\n Merges two diction"
},
{
"path": "models/gradient_boost/envs/sagemaker/configs/deploy.json",
"chars": 387,
"preview": "{\n \"deploy\": {\n \"app_name\": \"ml-ids-classifier\",\n \"instance_type\": \"ml.t2.medium\",\n \"instance_count\": 1,\n \""
},
{
"path": "models/gradient_boost/envs/sagemaker/configs/train-cpu.json",
"chars": 415,
"preview": "{\n \"train\": {\n \"instance_type\": \"ml.m5.large\",\n \"instance_count\": 1,\n \"task_type\": \"CPU\"\n },\n \"role\": \"arn:a"
},
{
"path": "models/gradient_boost/envs/sagemaker/configs/train-gpu.json",
"chars": 422,
"preview": "{\n \"train\": {\n \"instance_type\": \"ml.p2.xlarge\",\n \"instance_count\": 1,\n \"task_type\": \"GPU\"\n },\n \"role\": \"arn:"
},
{
"path": "models/gradient_boost/envs/sagemaker/container/Dockerfile",
"chars": 1623,
"preview": "FROM nvidia/cuda:10.1-base\n\n# Install Miniconda 3\nENV LANG=C.UTF-8 LC_ALL=C.UTF-8\nENV PATH /opt/conda/bin:$PATH\n\nRUN apt"
},
{
"path": "models/gradient_boost/envs/sagemaker/container/train.py",
"chars": 2135,
"preview": "#!/usr/bin/env python\n\nimport sys\nimport os\nimport json\nimport traceback\nimport uuid\nimport mlflow\n\nprefix = '/opt/ml/'\n"
},
{
"path": "models/gradient_boost/envs/sagemaker/scripts/build_image.sh",
"chars": 391,
"preview": "#!/usr/bin/env bash\n\nimage_name=$1\nimage_version=$2\n\nif [ \"$image_name\" == \"\" ]\nthen\n echo \"Usage: $0 <image-name>\"\n "
},
{
"path": "models/gradient_boost/envs/sagemaker/scripts/deploy.py",
"chars": 2162,
"preview": "import click\nimport json\nimport boto3\nimport tarfile\nimport re\nimport logging\nfrom mlflow import sagemaker\n\nlogging.basi"
},
{
"path": "models/gradient_boost/envs/sagemaker/scripts/push_image_to_ecr.sh",
"chars": 1206,
"preview": "#!/usr/bin/env bash\n\nimage_name=$1\nimage_version=$2\n\nif [ \"$image_name\" == \"\" ]\nthen\n echo \"Usage: $0 <image-name>\"\n "
},
{
"path": "models/gradient_boost/envs/sagemaker/scripts/train.py",
"chars": 3700,
"preview": "import json\nimport click\nimport logging\nfrom sagemaker.estimator import Estimator\n\nlogging.basicConfig(level=logging.INF"
},
{
"path": "models/gradient_boost/envs/sagemaker/scripts/undeploy.py",
"chars": 476,
"preview": "import click\nimport json\nfrom mlflow import sagemaker\n\n\n@click.command()\n@click.option('--config-path', type=click.Path("
},
{
"path": "models/gradient_boost/project/MLproject",
"chars": 1502,
"preview": "name: gradient_boost_model\n\nconda_env: conda.yaml\n\nentry_points:\n main:\n parameters:\n train_path: path\n va"
},
{
"path": "models/gradient_boost/project/conda.yaml",
"chars": 415,
"preview": "name: ml-ids-gradient-boost-catboost\nchannels:\n - anaconda\n - conda-forge\n - defaults\ndependencies:\n - python=3.7\n "
},
{
"path": "models/gradient_boost/project/train.py",
"chars": 7749,
"preview": "import click\nimport logging\nimport mlflow\nimport mlflow.pyfunc\nimport pickle\nimport os\nimport shutil\nfrom catboost impor"
},
{
"path": "models/gradient_boost/training_params.json",
"chars": 210,
"preview": "{\n \"task_type\": \"GPU\",\n \"use_val_set\": true,\n \"nr_iterations\": 2000,\n \"tree_depth\": 10,\n \"l2_reg\": 4.81391937494595"
},
{
"path": "models/gradient_boost/training_params_quick_run.json",
"chars": 106,
"preview": "{\n \"task_type\": \"GPU\",\n \"nr_iterations\": 10,\n \"nr_samples_attack_category\": 1000,\n \"random_seed\": 42\n}"
},
{
"path": "notebooks/01_data-cleanup/data_cleanup.ipynb",
"chars": 31941,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"## Data Cleanup\\n\",\n \"\\n\",\n \""
},
{
"path": "notebooks/02_exploratory-data-analysis/exploratory_data_analysis.ipynb",
"chars": 413948,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"## Exploratory Data Analysis\\n\",\n "
},
{
"path": "notebooks/03_ml-prototype/ml-prototype.ipynb",
"chars": 3632302,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Machine Learning Prototype\\n\",\n "
},
{
"path": "notebooks/03_ml-prototype/models/gradient_boost_model.cbm",
"chars": 133,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:f9ff34d59ef5e2a1040b921b0b1d7565c63e4fd8d9bf4d080cf31a5e9ee13fc0\ns"
},
{
"path": "notebooks/04_ml-prototype-spark/ml-prototype-spark.ipynb",
"chars": 34012,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Scaled ML Prototype\\n\",\n \"\\n\","
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/part-00000",
"chars": 763,
"preview": "{\"class\":\"org.apache.spark.ml.classification.GBTClassificationModel\",\"timestamp\":1568299022309,\"sparkVersion\":\"2.4.4\",\"u"
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/part-00000",
"chars": 682,
"preview": "{\"class\":\"org.apache.spark.ml.classification.GBTClassifier\",\"timestamp\":1568299022265,\"sparkVersion\":\"2.4.4\",\"uid\":\"GBTC"
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/part-00000",
"chars": 368,
"preview": "{\"class\":\"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\",\"timestamp\":1568299022228,\"sparkVersion\":\"2."
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/gb-model/metadata/part-00000",
"chars": 5028,
"preview": "{\"class\":\"org.apache.spark.ml.tuning.CrossValidatorModel\",\"timestamp\":1568299022196,\"sparkVersion\":\"2.4.4\",\"uid\":\"CrossV"
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/part-00000",
"chars": 350,
"preview": "{\"class\":\"pyspark.ml.pipeline.PipelineModel\",\"timestamp\":1568299021142,\"sparkVersion\":\"2.4.4\",\"uid\":\"PipelineModel_aec85"
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/part-00000",
"chars": 526,
"preview": "{\"class\":\"__main__.ValueCleaner\",\"timestamp\":1568299021262,\"sparkVersion\":\"2.4.4\",\"uid\":\"ValueCleaner_57f061a9e393\",\"par"
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/part-00000",
"chars": 650,
"preview": "{\"class\":\"org.apache.spark.ml.feature.ImputerModel\",\"timestamp\":1568299021353,\"sparkVersion\":\"2.4.4\",\"uid\":\"Imputer_3f8c"
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/part-00000",
"chars": 280,
"preview": "{\"class\":\"org.apache.spark.ml.feature.OneHotEncoderModel\",\"timestamp\":1568299021798,\"sparkVersion\":\"2.4.4\",\"uid\":\"OneHot"
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/part-00000",
"chars": 847,
"preview": "{\"class\":\"org.apache.spark.ml.feature.VectorAssembler\",\"timestamp\":1568299021967,\"sparkVersion\":\"2.4.4\",\"uid\":\"VectorAss"
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/_SUCCESS",
"chars": 0,
"preview": ""
},
{
"path": "notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/part-00000",
"chars": 236,
"preview": "{\"class\":\"__main__.BinaryLabelMaker\",\"timestamp\":1568299022005,\"sparkVersion\":\"2.4.4\",\"uid\":\"BinaryLabelMaker_3b174e5e0c"
},
{
"path": "notebooks/05_anomaly_detection/dl-anomaly-detection.ipynb",
"chars": 739871,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Anomaly Detection\\n\",\n \"\\n\",\n "
},
{
"path": "notebooks/05_anomaly_detection/models/denoising_autoencoder_model.h5",
"chars": 133,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:13f9ca921d4d76f3a745450fa844e22c2d5716440efcc22c2170f3bc0f21f179\ns"
},
{
"path": "notebooks/05_anomaly_detection/models/simple_autoencoder_model.h5",
"chars": 130,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:18216e715acf520b92ba511d4a27f37b90377887540a1d2b1217d46b41d7d93a\ns"
},
{
"path": "notebooks/05_anomaly_detection/models/stacked_autoencoder_model.h5",
"chars": 131,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:db4d61f4d8ee4e9d43db255afcac4c2443aea48268ea9ea867783460cdfa065d\ns"
},
{
"path": "notebooks/05_anomaly_detection/notebook_utils.py",
"chars": 3692,
"preview": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.metrics import classification_report"
},
{
"path": "notebooks/06_dl_classifier/dl-classifier.ipynb",
"chars": 711065,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Deep Learning Classifier\\n\",\n "
},
{
"path": "notebooks/06_dl_classifier/models/c0cb0656-558f-4311-b138-9b91ab4d1fe6.h5",
"chars": 132,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:8efc348b48af452153dec068d1367cec784ffc2930049df4eaf371d10c0d1caa\ns"
},
{
"path": "notebooks/06_dl_classifier/models/model_class_weight.h5",
"chars": 132,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:60e8392296780d912e8bff335bdadb81deae5b035925d2282e963d45def4ce95\ns"
},
{
"path": "notebooks/06_dl_classifier/models/model_no_class_weights.h5",
"chars": 132,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:e0d9dff11e5600e74974a8e6657be10f7af7d1abe7b66ea2308d9d6eea4d29eb\ns"
},
{
"path": "notebooks/06_dl_classifier/models/opt_model.h5",
"chars": 132,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:d941b094728d0e970231c2f40440da9bfe2c9c5f9898954064954483d210857a\ns"
},
{
"path": "notebooks/06_dl_classifier/notebook_utils.py",
"chars": 3461,
"preview": "import numpy as np\nimport gc\nfrom ml_ids.model_selection import split_x_y, train_val_test_split\nfrom ml_ids.transform.sa"
},
{
"path": "notebooks/07_binary_classifier_comparison/binary-classifier-comparison.ipynb",
"chars": 535684,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Binary Classifier Comparison\\n\",\n"
},
{
"path": "notebooks/07_binary_classifier_comparison/models/gb_835066e8-2427-48ca-a521-67195008cb91.catboost",
"chars": 133,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:ceccc696d2c5eae0d550425f772221088e7a66b26a626461642e14c2b42099ce\ns"
},
{
"path": "notebooks/07_binary_classifier_comparison/notebook_utils.py",
"chars": 3959,
"preview": "import numpy as np\nimport gc\nfrom ml_ids.model_selection import split_x_y, train_val_test_split\nfrom ml_ids.transform.sa"
},
{
"path": "setup.cfg",
"chars": 505,
"preview": "[aliases]\ntest=pytest\n\n[mypy-numpy.*]\nignore_missing_imports = True\n\n[mypy-pandas.*]\nignore_missing_imports = True\n\n[myp"
},
{
"path": "setup.py",
"chars": 547,
"preview": "from distutils.core import setup\n\nsetup(\n name='ml-ids',\n version='0.1',\n description='Machine learning based I"
},
{
"path": "tests/data/test_dataset.py",
"chars": 2546,
"preview": "import pytest\nimport pandas as pd\nimport numpy as np\nimport os\nfrom ml_ids import conf\nfrom ml_ids.data.dataset import l"
},
{
"path": "tests/transform/test_preprocessing.py",
"chars": 4884,
"preview": "import pytest\nimport numpy as np\nfrom numpy.testing import assert_array_equal\nfrom sklearn.preprocessing import Function"
},
{
"path": "tests/validation_data/validation.csv",
"chars": 76512,
"preview": ",dst_port,protocol,timestamp,flow_duration,tot_fwd_pkts,tot_bwd_pkts,totlen_fwd_pkts,totlen_bwd_pkts,fwd_pkt_len_max,fwd"
},
{
"path": "upload.py",
"chars": 0,
"preview": ""
}
]
// ... and 110 more files (download for full content)
About this extraction
This page contains the full source code of the cstub/ml-ids GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 219 files (6.0 MB), approximately 1.6M tokens, and a symbol index with 105 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.