Repository: NetManAIOps/LogClass Branch: master Commit: 4ae7c999c290 Files: 44 Total size: 84.4 KB Directory structure: gitextract_jrwg3tth/ ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── compare_pu.py ├── decorators.py ├── feature_engineering/ │ ├── __init__.py │ ├── length.py │ ├── registry.py │ ├── tf.py │ ├── tf_idf.py │ ├── tf_ilf.py │ ├── utils.py │ └── vectorizer.py ├── init_params.py ├── logclass.py ├── models/ │ ├── __init__.py │ ├── base_model.py │ ├── binary_registry.py │ ├── multi_registry.py │ ├── pu_learning.py │ ├── regular.py │ └── svm.py ├── preprocess/ │ ├── __init__.py │ ├── bgl_preprocessor.py │ ├── open_source_logs.py │ ├── registry.py │ └── utils.py ├── puLearning/ │ ├── __init__.py │ └── puAdapter.py ├── reporting/ │ ├── __init__.py │ ├── accuracy.py │ ├── bb_registry.py │ ├── confusion_matrix.py │ ├── macrof1.py │ ├── microf1.py │ ├── multi_class_acc.py │ ├── top_k_svm.py │ └── wb_registry.py ├── requirements.txt ├── run_binary.py ├── train_binary.py ├── train_multi.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class *.pyc # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ output/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # vs settings .vscode/ # data data/** !data/open_source_logs/ !data/open_source_logs/** ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 Federico Zaiter Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ ## LogClass This repository provides an open-source toolkit for LogClass framework from W. Meng et al., "[LogClass: Anomalous Log Identification and Classification with Partial Labels](https://ieeexplore.ieee.org/document/9339940)," in IEEE Transactions on Network and Service Management, doi: 10.1109/TNSM.2021.3055425. LogClass automatically and accurately detects and classifies anomalous logs based on partial labels. ### Table of Contents [LogClass](#logclass) - [Table of Contents](#table-of-contents) - [Requirements](#requirements) - [Quick Start](#quick-start) - [Run LogClass](#run-logclass) - [Arguments](#arguments) - [Directory Structure](#directory-structure) - [Datasets](#datasets) - [How to](#how-to) - [How to add a new dataset](#how-to-add-a-new-dataset) - [Preprocessed Logs Format](#preprocessed-logs-format) - [How to run a new experiment](#how-to-run-a-new-experiment) - [Custom experiment](#custom-experiment) - [How to add a new model](#how-to-add-a-new-model) - [How to extract a new feature](#how-to-extract-a-new-feature) - [Included Experiments](#included-experiments) - [Testing PULearning](#testing-pulearning) - [Testing Anomaly Classification](#testing-anomaly-classification) - [Global LogClass](#global-logclass) - [Binary training/inference](#binary-traininginference) - [Citing](#citing) ​ ### Requirements Requirements are listed in `requirements.txt`. To install these, run: ``` pip install -r requirements.txt ``` ### Quick Start #### Run LogClass Several example experiments using LogClass are included in this repository. 
Here is an example to run one of them - training of the global experiment doing anomaly detection and classification. Run the following command in the home directory of this project: ``` python -m LogClass.logclass --train --kfold 3 --logs_type "bgl" --raw_logs "./Data/RAS_LOGS" --report macro ``` #### Arguments ``` python -m LogClass.logclass --help usage: logclass.py [-h] [--raw_logs raw_logs] [--base_dir base_dir] [--logs logs] [--models_dir models_dir] [--features_dir features_dir] [--logs_type logs_type] [--kfold kfold] [--healthy_label healthy_label] [--features features [features ...]] [--report report [report ...]] [--binary_classifier binary_classifier] [--multi_classifier multi_classifier] [--train] [--force] [--id id] [--swap] Runs binary classification with PULearning to detect anomalous logs. optional arguments: -h, --help show this help message and exit --raw_logs raw_logs input raw logs file path (default: None) --base_dir base_dir base output directory for pipeline output files (default: ['{your_logclass_dir}\\output']) --logs logs input logs file path and output for raw logs preprocessing (default: None) --models_dir models_dir trained models input/output directory path (default: None) --features_dir features_dir trained features_dir input/output directory path (default: None) --logs_type logs_type Input type of logs. (default: ['open_Apache']) --kfold kfold kfold crossvalidation (default: None) --healthy_label healthy_label the labels of unlabeled logs (default: ['unlabeled']) --features features [features ...] Features to be extracted from the logs messages. (default: ['tfilf']) --report report [report ...] Reports to be generated from the model and its predictions. (default: None) --binary_classifier binary_classifier Binary classifier to be used as anomaly detector. (default: ['pu_learning']) --multi_classifier multi_classifier Multi-clas classifier to classify anomalies. (default: ['svm']) --train If set, logclass will train on the given data. 
Otherwiseit will run inference on it. (default: False) --force Force training overwriting previous output with same id. (default: False) --id id Experiment id. Automatically generated if not specified. (default: None) --swap Swap testing/training data in kfold cross validation. (default: False) ``` #### Directory Structure ``` . ├── data │   └── open_source_logs # Included open-source log datasets │   ├── Apache │   ├── bgl │   ├── hadoop │   ├── hdfs │   ├── hpc │   ├── proxifier │   └── zookeeper ├── output # Example output folder │   ├── preprocessed_logs # Saved preprocessed logs for reuse │   │   ├── open_Apache.txt │   │   └── open_bgl.txt │   └── train_multi_open_bgl_2283696426 # Example experiment output │      ├── best_params.json │      ├── features │      │   ├── tfidf.pkl │      │   └── vocab.pkl │      ├── models │      │   └── multi.pkl │      └── results.csv ├── feature_engineering │   ├── __init__.py │   ├── length.py │   ├── tf_idf.py │   ├── tf_ilf.py │   ├── tf.py │   ├── registry.py │   ├── vectorizer.py # Log message vectorizing utilities │   └── utils.py ├── models │   ├── __init__.py │   ├── base_model.py # BaseModel class extended by all models │   ├── pu_learning.py │   ├── regular.py │   ├── svm.py │   ├── binary_registry.py │   └── multi_registry.py ├── preprocess │   ├── __init__.py │   ├── bgl_preprocessor.py │   ├── open_source_logs.py │   ├── registry.py │   └── utils.py ├── reporting │   ├── __init__.py │   ├── accuracy.py │   ├── confusion_matrix.py │   ├── macrof1.py │   ├── microf1.py │   ├── multi_class_acc.py │   ├── top_k_svm.py │   ├── bb_registry.py │   └── wb_registry.py ├── puLearning # PULearning third party implementation │   ├── __init__.py │   └── puAdapter.py ├── __init__.py ├── LICENSE ├── README.md ├── requirements.txt ├── init_params.py # Parses arguments, initializes global parameters ├── logclass.py # Performs training and inference of LogClass ├── test_pu.py # Compares robustness of LogClass ├── train_multi.py # 
Trains LogClass for anomalies classification ├── train_binary.py # Trains LogClass for log anomaly detection ├── run_binary.py # Loads trained LogClass and detects anomalies ├── decorators.py └── utils.py ``` #### Datasets In this repository we include various [open-source logs datasets](https://github.com/logpai/loghub) in the `data` folder as well as their corresponding preprocessing module in the `preprocess` package. Additionally there is another preprocessor provided for [BGL logs data](https://www.usenix.org/cfdr-data#hpc4), which can be downloaded directly from [here](https://www.usenix.org/sites/default/files/4372-intrepid_ras_0901_0908_scrubbed.zip.tar). ### How to Explain how to use and extend this toolkit. #### How to add a new dataset Add a new preprocessor module in the `preprocess` package. The module should implement a function that follows the `preprocess_datset(params)` function template included in all preprocessors. It should be decorated with `@register(f"{dataset_name}")` , e.g. open_Apache, and call the `process_logs(input_source, output, process_line)` function. This `process_line` function should also be defined in the processor as well. When done, add the module name to the `__init__.py` list of modules from the `preprocess` package and also the name from the decorator in the argsparse parameters options as the logs type. For example, `--logs_type open_Apache`. ##### Preprocessed Logs Format This format is ensured by the `process_line` function which is to be defined in each preprocessor. ```python def process_line(line): """ Processes a given line from the raw logs. Parameter --------- line : str One line from the raw logs. Returns ------- str String with the format f"{label} {msg}" where the `label` indicates whether the log is anomalous and if so, which anomaly category, and `msg` is the filtered log message without parameters. 
""" # your code ``` To filter the log message parameters, use the `remove_parameters(msg)`function from the `utils.py` module in the `preprocess` package. #### How to run a new experiment Several experiments examples are included in the repository. The best way to start with creating a new one is to follow the example from the others, specially the main function structure and its experiment function be it training or testing focused. The key things to consider the experiment should include are the following: - **Args parsing**: create custom `init_args()` and `parse_args(args)` functions for your experiment that call `init_main_args()` from the `init_params.py` module. - **Output file handling**: use `file_handling(params)` function (see `utils.py` in the main directory of the repo). - **Preprocessing raw logs**: if `--raw_logs` argument is provided, get the preprocessing function using the `--logs_type` argument from the `preprocess` module registry calling `get_preprocessor(f'{logs_type}')` function. - **Load logs**: call the `load_logs(params, ...)` function to get the preprocessed logs from the directory specified in the `--logs` parameter. It will return a tuple of x, y, and target label names data. ##### Custom experiment Main functions to consider for a custom experiment. Usually in its own function. **Feature Engineering** - `extract_features(x, params)` from `feature_engineering` package's `utils.py` module: Extracts all specified features in `--features` parameter from the preprocessed logs. See the function definition for further details. - `build_vocabulary(x)` from `feature_engineering` package's `vectorizer.py` module: Divides log into tokens and creates vocabulary. See the function definition for further details. - `log_to_vector(x, vocabulary)` from `feature_engineering` package's `vectorizer.py` module: Vectorizes each log message using a dict of words to index. See the function definition for further details. 
- `get_features_vector(x_vector, vocabulary, params)` from `feature_engineering` package's `utils.py` module: Extracts all specified features from the vectorized logs. See the function definition for further details. **Model training and inference** Each model extends the `BaseModel` class from module `base_model.py`. See the class definition for further details. There are two registries in the `models` package, one for binary models meant to be used for anomaly detection and another one for multi-classification models to classify the anomalies. Get the constructor for either using the `--binary_classifier` or `--multi_classifier` argument specified. E.g. `binary_classifier_registry.get_binary_model(params['binary_classifier'])`. By extending `BaseModel` the model is always saved when it fits the data. Load a model by calling its `load()` method. It will use the `params` attribute of the `BaseModel` class to get the experiment id and load its corresponding model. To save the params of an experiment call the `save_params(params)` function from the `utils.py` module in the main directory. `load_params(params)` in case of only using the module for inference. **Reporting** There are two kinds of reports, black box and white box and a registry for each in the `reporting` module. To use them, call the corresponding registry and obtain the report wrapper using `black_box_report_registry.get_bb_report('acc')`, for example. To add new reports, see the analogous explanation for [models](#how-to-add-a-new-model) or [features](#how-to-extract-a-new-feature) below. **Saving results** Among the provided experiments, `test_pu.py` and `train_multi.py` save their results creating a dict of column names to lists of results. Then the `save_results.py` function from the `utils.py` module is used to save them to a CSV file. #### How to add a new model To add a new model, implement a class that extends the `BaseModel` class and include its module in the `models` package. 
See the class definition for further details. Decorate a method that calls its constructor and returns an instance of the model with the `@register(f"{model_name}")` decorator from either the `binary_registry.py` or the `multi_registry.py` module of the `models` package, depending on whether the model is for anomaly detection or classification, respectively.

Finally, make sure you add the module's name in the `__init__.py` module from the `models` package, and the model option in the `init_params.py` module within the list for either the `--binary_classifier` or `--multi_classifier` argument. This way the constructor can be obtained by doing `binary_classifier_registry.get_binary_model(params['binary_classifier'])`, for example.

#### How to extract a new feature

To add a new feature extractor, create a module in the `feature_engineering` package that wraps your feature extractor function and returns the features. See the `length.py` module as an example for further details. As in the other cases, decorate the wrapper function with `@register(f"{feature_name}")` and make sure you add the module name in the `__init__.py` from the `feature_engineering` package, and the feature as an option in the `init_params.py` module's `--features` argument.

### Included Experiments

High-level overview of each of the experiments included in the repository.

#### Testing PULearning

`test_pu.py` is mainly focused on proving the robustness of LogClass for anomaly detection when only a few logs are labeled as anomalous. It compares PULearning+RandomForest with any other given anomaly detection algorithm. Using the given data, it starts with only healthy logs in the unlabeled data and gradually increases the share of anomalous logs up to 10%.
To test PULearning, run the following command in the home directory of this project:

```
python -m LogClass.test_pu --logs_type "bgl" --raw_logs "./Data/RAS from Weibin/RAS_raw_label.dat" --binary_classifier regular --ratio 8 --step 1 --top_percentage 11 --kfold 3
```

This would first preprocess the logs. Then, for each kfold iteration, it will perform feature extraction and force a 1:8 ratio of anomalous:healthy logs. Finally, with a step of 1%, it will go from 0% to 10% anomalous logs in the unlabeled set and compare the accuracy of both anomaly detection algorithms. If no classifier is specified, it will default to a plain RF.

#### Testing Anomaly Classification

`train_multi.py` is focused on showing the robustness of LogClass' TF-ILF feature extraction approach for multi-class anomaly classification. The main detail is that when using `--kfold N`, one can swap the training/testing data slices using the `--swap` flag. This way, for instance, it can train on 10% of the logs and test on the remaining 90% when pairing `--swap` with `N == 10`.

To run such an experiment, use the following command from the parent directory of the project:

```
python -m LogClass.train_multi --logs_type "open_Apache" --raw_logs "./Data/open_source_logs/" --kfold 10 --swap
```

#### Global LogClass

`logclass.py` is set up so that it does both training and testing of the learned models, depending on the flags. For example, to preprocess and train, run the following command in the home directory of this project:

```
python -m LogClass.logclass --train --kfold 3 --logs_type "bgl" --raw_logs "./Data/RAS_LOGS"
```

This would first preprocess the raw BGL logs and extract their TF-ILF features, then train and save both PULearning with a RandomForest for anomaly detection and an SVM for multi-class anomaly classification.
For running inference, simply run:

```
python -m LogClass.logclass --logs_type "bgl"
```

In this case, it would load the learned feature extraction approach and both learned models, and run inference on the whole set of logs.

#### Binary training/inference

`train_binary.py` and `run_binary.py` simply separate the binary part of `logclass.py` into two modules: one for training both the feature extraction and the models, and another one for loading these and running inference.

### Citing

If you find LogClass useful for your research, please consider citing the paper:

```
@ARTICLE{9339940,
  author={Meng, Weibin and Liu, Ying and Zhang, Shenglin and Zaiter, Federico and Zhang, Yuzhe and Huang, Yuheng and Yu, Zhaoyang and Zhang, Yuzhi and Song, Lei and Zhang, Ming and Pei, Dan},
  journal={IEEE Transactions on Network and Service Management},
  title={LogClass: Anomalous Log Identification and Classification with Partial Labels},
  year={2021},
  doi={10.1109/TNSM.2021.3055425}
}
```

This code was completed by [@Weibin Meng](https://github.com/WeibinMeng) and [@Federico Zaiter](https://github.com/federicozaiter).
================================================ FILE: __init__.py ================================================ __all__ = ["utils", "logclass"] from .preprocess import * from .feature_engineering import * from .models import * from .reporting import * ================================================ FILE: compare_pu.py ================================================ from sklearn.model_selection import StratifiedKFold from .utils import ( file_handling, TestingParameters, print_params, save_results, ) from .preprocess import registry as preprocess_registry from .preprocess.utils import load_logs from .feature_engineering.utils import ( binary_train_gtruth, extract_features, ) from .models import binary_registry as binary_classifier_registry from .reporting import bb_registry as black_box_report_registry from .init_params import init_main_args, parse_main_args import numpy as np def init_args(): """Init command line args used for configuration.""" parser = init_main_args() parser.add_argument( "--ratio", metavar="ratio", type=int, nargs=1, default=[8], help="ratio", ) parser.add_argument( "--top_percentage", metavar="top_percentage", type=int, nargs=1, default=[11], help="top_percentage", ) parser.add_argument( "--step", metavar="step", type=int, nargs=1, default=[2], help="step", ) return parser.parse_args() def parse_args(args): """Parse provided args for runtime configuration.""" params = parse_main_args(args) additional_params = { "ratio": args.ratio[0], "top_percentage": args.top_percentage[0], "step": args.step[0], "train": True, } params.update(additional_params) return params def force_ratio(params, x_data, y_data): """Force a ratio between anomalous and healthy logs""" ratio = params['ratio'] if ratio > 0: anomalous = np.where(y_data == 1.0)[0] healthy = np.where(y_data == -1.0)[0] if len(anomalous) * ratio <= len(healthy): keep_anomalous = len(anomalous) keep_healthy = keep_anomalous * ratio else: keep_anomalous = len(healthy) // ratio keep_healthy = 
len(healthy) np.random.seed(10) permut = np.random.permutation(len(healthy)) keep = permut[:keep_healthy] healthy = healthy[keep] permut = np.random.permutation(len(anomalous)) keep = permut[:keep_anomalous] anomalous = anomalous[keep] result = sorted(np.concatenate((anomalous, healthy))) y_data = y_data[result] x_data = x_data[result] return x_data, y_data def init_results(params): results = { 'exp_name': [], 'logs_type': [], 'percentage': [], 'pu_f1': [], f"{params['binary_classifier']}_f1": [], } return results def add_result(results, params, percentage, pu_acc, b_clf_acc): results['exp_name'].append(params['id']) results['logs_type'].append(params['logs_type']) results['percentage'].append(percentage) results['pu_f1'].append(pu_acc) results[f"{params['binary_classifier']}_f1"].append(b_clf_acc) def run_test(params, x_data, y_data): results = init_results(params) # Binary training features y_data = binary_train_gtruth(y_data) x_data, y_data = force_ratio(params, x_data, y_data) print("total logs", len(y_data)) print(len(np.where(y_data == -1.0)[0]), " are unlabeled") print(len(np.where(y_data == 1.0)[0]), " are anomalous") # KFold Cross Validation kfold = StratifiedKFold(n_splits=params['kfold']).split(x_data, y_data) for train_index, test_index in kfold: x_train, x_test = x_data[train_index], x_data[test_index] y_train, y_test = y_data[train_index], y_data[test_index] x_train, _ = extract_features(x_train, params) with TestingParameters(params): x_test, _ = extract_features(x_test, params) np.random.seed(5) permut = np.random.permutation(len(y_train)) x_train = x_train[permut] y_train = y_train[permut] top_percentage = params['top_percentage'] step = params['step'] # Relabeling anomalous logs to unlabeled to test PU Learning Robustness for i in range(0, top_percentage, step): y_train_pu = np.copy(y_train) if i > 0: n_unlabeled = len(np.where(y_train_pu == -1.0)[0]) sacrifice_size = (i*n_unlabeled)//(100 - i) print(i, n_unlabeled, sacrifice_size) pos = 
np.where(y_train == 1.0)[0] np.random.shuffle(pos) sacrifice = pos[: sacrifice_size] y_train_pu[sacrifice] = -1.0 print(f"{i}% of anomalous log in unlabeled logs:") print(len(np.where(y_train_pu == -1.0)[0]), " are unlabeled") print(len(np.where(y_train_pu == 1.0)[0]), " are anomalous") # Binary PULearning with RF pu_getter =\ binary_classifier_registry.get_binary_model("pu_learning") binary_clf = pu_getter(params) binary_clf.fit(x_train, y_train_pu) y_pred_pu = binary_clf.predict(x_test) get_accuracy = black_box_report_registry.get_bb_report('acc') pu_acc = get_accuracy(y_test, y_pred_pu) # Comparing given Binary Classifier with PU Learning comparison_clf_getter =\ binary_classifier_registry.get_binary_model( params['binary_classifier']) binary_clf = comparison_clf_getter(params) binary_clf.fit(x_train, y_train_pu) y_pred = binary_clf.predict(x_test) b_clf_acc = get_accuracy(y_test, y_pred) print(f"PU Acc: {pu_acc}\n{params['binary_classifier']}" + " Acc: {b_clf_acc}") add_result( results, params, i, pu_acc, b_clf_acc ) save_results(results, params) def main(): # Init params params = parse_args(init_args()) print_params(params) file_handling(params) # Filter params from raw logs if "raw_logs" in params: preprocess = preprocess_registry.get_preprocessor(params['logs_type']) preprocess(params) # Load filtered params from file print('Loading logs') x_data, y_data, _ = load_logs(params) run_test(params, x_data, y_data) if __name__ == "__main__": main() ================================================ FILE: decorators.py ================================================ import functools # Borrowed from https://realpython.com/primer-on-python-decorators/ def debug(func): """Print the function signature and return value""" @functools.wraps(func) def wrapper_debug(*args, **kwargs): args_repr = [repr(a) for a in args] # 1 kwargs_repr = [f"{k}={v!r}" for k, v in kwargs.items()] # 2 signature = ", ".join(args_repr + kwargs_repr) # 3 print(f"Calling 
{func.__name__}({signature})") value = func(*args, **kwargs) print(f"{func.__name__!r} returned {value!r}") # 4 return value return wrapper_debug def print_step(func): """Print the function signature and return value""" @functools.wraps(func) def wrapper_print_name(*args, **kwargs): print(f"Calling {func.__qualname__}") value = func(*args, **kwargs) return value return wrapper_print_name ================================================ FILE: feature_engineering/__init__.py ================================================ __all__ = ["length", "tf_idf", "tf_ilf", "tf"] ================================================ FILE: feature_engineering/length.py ================================================ from .registry import register import numpy as np @register("length") def create_length_feature(params, input_vector, **kwargs): """ Returns an array of lengths of each tokenized log message from the input. Parameters ---------- params : dict of experiment parameters. input_vector : numpy Array vector of word indexes from each log message line. Returns ------- numpy array of lengths of each tokenized log message from the input with shape (number_of_logs, N). """ length = np.vectorize(len) length_feature = length(input_vector) length_feature = length_feature.reshape(-1, 1) return length_feature ================================================ FILE: feature_engineering/registry.py ================================================ """Basic registry for logs vector feature engineering. 
These take the log messages as input and extract and return a feature to be appended to the feature vector.""" _FEATURE_EXTRACTORS = dict() def register(name): """Registers a new log message feature extraction function under the given name.""" def add_to_dict(func): _FEATURE_EXTRACTORS[name] = func return func return add_to_dict def get_feature_extractor(feature): """Fetches the feature extraction function associated with the given raw logs""" return _FEATURE_EXTRACTORS[feature] ================================================ FILE: feature_engineering/tf.py ================================================ from .registry import register from .vectorizer import get_tf import numpy as np from .utils import save_feature_dict, load_feature_dict def create_tf_vector(input_vector, tf_dict, vocabulary): tf_vector = [] # Creating the idf/ilf vector for each log message for line in input_vector: cur_tf_vector = np.zeros(len(vocabulary)) for token_index in line: cur_tf_vector[token_index] = len(tf_dict[token_index]) tf_vector.append(cur_tf_vector) tf_vector = np.array(tf_vector) return tf_vector @register("tf") def create_term_count_feature(params, input_vector, **kwargs): """ Returns an array of the counts of each word per log message. """ if params['train']: tf_dict = get_tf(input_vector) save_feature_dict(params, tf_dict, "tf") else: tf_dict = load_feature_dict(params, "tf") tf_features =\ create_tf_vector(input_vector, tf_dict, kwargs['vocabulary']) return tf_features ================================================ FILE: feature_engineering/tf_idf.py ================================================ from .registry import register from .vectorizer import ( get_tf, calculate_idf, calculate_tf_invf_train, create_invf_vector, ) from .utils import save_feature_dict, load_feature_dict @register("tfidf") def create_tfidf_feature(params, train_vector, **kwargs): """ Returns the tf-idf matrix of features. 
""" if params['train']: invf_dict = calculate_tf_invf_train( train_vector, get_f=get_tf, calc_invf=calculate_idf ) save_feature_dict(params, invf_dict, "tfidf") else: invf_dict = load_feature_dict(params, "tfidf") features = create_invf_vector( train_vector, invf_dict, kwargs['vocabulary']) return features ================================================ FILE: feature_engineering/tf_ilf.py ================================================ from .registry import register from .vectorizer import ( get_lf, calculate_ilf, calculate_tf_invf_train, create_invf_vector, ) from .utils import save_feature_dict, load_feature_dict @register("tfilf") def create_tfilf_feature(params, train_vector, **kwargs): """ Returns the tf-ilf matrix of features. """ if params['train']: invf_dict = calculate_tf_invf_train( train_vector, get_f=get_lf, calc_invf=calculate_ilf ) save_feature_dict(params, invf_dict, "tfilf") else: invf_dict = load_feature_dict(params, "tfilf") features = create_invf_vector( train_vector, invf_dict, kwargs['vocabulary']) return features ================================================ FILE: feature_engineering/utils.py ================================================ import os import pickle import numpy as np from .vectorizer import log_to_vector, build_vocabulary from . 
# --- feature_engineering/utils.py ---
def load_feature_dict(params, name):
    """Unpickle the feature dict stored as <features_dir>/<name>.pkl."""
    path = os.path.join(params['features_dir'], f"{name}.pkl")
    with open(path, "rb") as handle:
        return pickle.load(handle)


def save_feature_dict(params, feat_dict, name):
    """Pickle feat_dict to <features_dir>/<name>.pkl."""
    path = os.path.join(params['features_dir'], f"{name}.pkl")
    with open(path, "wb") as handle:
        pickle.dump(feat_dict, handle)


def binary_train_gtruth(y):
    """Collapse labels to a binary ground truth: -1.0 stays -1.0
    (healthy/unlabeled, see load_logs), every other label becomes 1.0."""
    return np.where(y != -1.0, 1.0, -1.0)


def multi_features(x, y):
    """Keep only the anomalous rows (label != -1) for multi-class work."""
    anomalous_mask = (y != -1)
    return x[anomalous_mask], y[anomalous_mask]


@print_step
def get_features_vector(log_vector, vocabulary, params):
    """
    Extracts all specified features from the vectorized logs.

    For each feature named in params['features'], fetches the extractor
    from the feature registry and applies it to the data; each extractor
    returns a (number_of_logs, N) array and the results are concatenated
    along the second axis.

    Parameters
    ----------
    log_vector : numpy array of word indexes per log message line.
    vocabulary : dict mapping a word to an index.
    params : dict of experiment parameters.

    Returns
    -------
    numpy ndarray of all specified features.
    """
    columns = [
        feature_registry.get_feature_extractor(name)(
            params, log_vector, vocabulary=vocabulary)
        for name in params['features']
    ]
    return np.hstack(columns)


@print_step
def extract_features(x, params):
    """
    Gets vocabulary and specified features from the preprocessed logs.

    Builds (train) or reloads (inference) the vocabulary, vectorizes each
    message, then extracts all features listed in params.

    Parameters
    ----------
    x : list of preprocessed logs, one log message per line.
    params : dict of experiment parameters.

    Returns
    -------
    x_features : numpy ndarray of all specified features.
    vocabulary : dict mapping a word to an index.
    """
    if params['train']:
        vocabulary = build_vocabulary(x)
        save_feature_dict(params, vocabulary, "vocab")
    else:
        vocabulary = load_feature_dict(params, "vocab")
    x_vector = log_to_vector(x, vocabulary)
    return get_features_vector(x_vector, vocabulary, params), vocabulary


# --- feature_engineering/vectorizer.py ---
def get_ngrams(n, line):
    """Return the list of space-joined n-grams of the tokenized line."""
    tokens = line.strip().split()
    if not tokens:
        # Token list is empty.
        return []
    if len(tokens) < n:
        # The whole token list fits in a single short gram.
        return [" ".join(tokens)]
    return [
        " ".join(tokens[start: start + n])
        for start in range(len(tokens) - n + 1)
    ]


def tokenize(line):
    """Whitespace tokenization."""
    return line.strip().split()


@print_step
def build_vocabulary(inputData):
    """
    Divides logs into tokens and assigns each distinct token an index.

    Parameter
    ---------
    inputData : list of log message lines.

    Returns
    -------
    vocabulary : word-to-index dict.
    """
    vocabulary = {}
    for line in inputData:
        for token in tokenize(line):
            # setdefault only inserts unseen tokens; the default is
            # evaluated before insertion, so indexes stay sequential.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary


@print_step
def log_to_vector(inputData, vocabulary):
    """
    Vectorizes each log message using a word-to-index dict; tokens not in
    the vocabulary are dropped, lines that tokenize to nothing are skipped.

    Returns
    -------
    numpy array of per-line lists of word indexes.
    """
    encoded = []
    for line in inputData:
        token_list = tokenize(line)
        if token_list:
            encoded.append(
                [vocabulary[t] for t in token_list if t in vocabulary])
    return np.array(encoded)
def setTrainDataForILF(x, y):
    """Deduplicate log lines, keeping the label of each first occurrence."""
    x_res, first_indices = np.unique(x, return_index=True)
    return x_res, y[first_indices]


def calculate_inv_freq(total, num):
    """Smoothed inverse frequency: log(total / (num + 0.01))."""
    return np.log(float(total) / float(num + 0.01))


def get_max_line(inputVector):
    """Length of the longest tokenized log line."""
    return len(max(inputVector, key=len))


def get_tf(inputVector):
    """Map each token index to the set of message indexes containing it."""
    occurrences = defaultdict(set)
    for msg_idx, line in enumerate(inputVector):
        for token in line:
            occurrences[token].add(msg_idx)
    return occurrences


def get_lf(inputVector):
    """Map each token index to the set of positions it occupies in lines."""
    locations = defaultdict(set)
    for line in inputVector:
        for position, token in enumerate(line):
            locations[token].add(position)
    return locations


def calculate_idf(token_index_dict, inputVector):
    """Inverse document frequency for every token in token_index_dict."""
    total_log_num = len(inputVector)
    return {
        token: calculate_inv_freq(total_log_num, len(docs))
        for token, docs in token_index_dict.items()
    }


def calculate_ilf(token_index_dict, inputVector):
    """Inverse location frequency for every token in token_index_dict."""
    max_length = get_max_line(inputVector)
    return {
        token: calculate_inv_freq(max_length, len(locs))
        for token, locs in token_index_dict.items()
    }


def create_invf_vector(inputVector, invf_dict, vocabulary):
    """Build the dense tf*invf matrix, one row per log message."""
    rows = []
    for line in inputVector:
        row = np.zeros(len(vocabulary))
        counts = Counter(line)
        for token_index in line:
            row[token_index] = (
                float(counts[token_index]) * invf_dict[token_index]
            )
        rows.append(row)
    return np.array(rows)


def normalize_tfinvf(tfinvf):
    """Rescale the whole matrix linearly into [-1, 1]."""
    return 2. * (tfinvf - np.min(tfinvf)) / np.ptp(tfinvf) - 1


def calculate_tf_invf_train(
    inputVector,
    get_f=get_tf,
    calc_invf=calculate_idf
):
    """Compute the inverse-frequency dict for the training data."""
    token_index_dict = get_f(inputVector)
    return calc_invf(token_index_dict, inputVector)
def init_main_args():
    """Build the argparse parser for the LogClass pipeline.

    Returns
    -------
    argparse.ArgumentParser with all shared command line options.

    Fixes: "--train" help previously concatenated "Otherwise"+"it" without
    a space; "--multi_classifier" help had a "Multi-clas" typo.
    """
    parser = argparse.ArgumentParser(
        description="Runs experiment using LogClass Framework",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--raw_logs",
        metavar="raw_logs",
        type=str,
        nargs=1,
        help="input raw logs file path",
    )
    base_dir_default = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "output"
    )
    parser.add_argument(
        "--base_dir",
        metavar="base_dir",
        type=str,
        nargs=1,
        default=[base_dir_default],
        help="base output directory for pipeline output files",
    )
    parser.add_argument(
        "--logs",
        metavar="logs",
        type=str,
        nargs=1,
        help="input logs file path and output for raw logs preprocessing",
    )
    parser.add_argument(
        "--models_dir",
        metavar="models_dir",
        type=str,
        nargs=1,
        help="trained models input/output directory path",
    )
    parser.add_argument(
        "--features_dir",
        metavar="features_dir",
        type=str,
        nargs=1,
        help="trained features_dir input/output directory path",
    )
    parser.add_argument(
        "--logs_type",
        metavar="logs_type",
        type=str,
        nargs=1,
        default=["open_Apache"],
        choices=[
            "bgl",
            "open_Apache",
            "open_bgl",
            "open_hadoop",
            "open_hdfs",
            "open_hpc",
            "open_proxifier",
            "open_zookeeper",
        ],
        help="Input type of logs.",
    )
    parser.add_argument(
        "--kfold",
        metavar="kfold",
        type=int,
        nargs=1,
        help="kfold crossvalidation",
    )
    parser.add_argument(
        "--healthy_label",
        metavar='healthy_label',
        type=str,
        nargs=1,
        default=["unlabeled"],
        help="the labels of unlabeled logs",
    )
    parser.add_argument(
        "--features",
        metavar="features",
        type=str,
        nargs='+',
        default=["tfilf"],
        choices=["tfidf", "tfilf", "length", "tf"],
        help="Features to be extracted from the logs messages.",
    )
    parser.add_argument(
        "--report",
        metavar="report",
        type=str,
        nargs='+',
        default=["confusion_matrix"],
        choices=["confusion_matrix", "acc", "multi_acc", "top_k_svm",
                 "micro", "macro"],
        help="Reports to be generated from the model and its predictions.",
    )
    parser.add_argument(
        "--binary_classifier",
        metavar="binary_classifier",
        type=str,
        nargs=1,
        default=["pu_learning"],
        choices=["pu_learning", "regular"],
        help="Binary classifier to be used as anomaly detector.",
    )
    parser.add_argument(
        "--multi_classifier",
        metavar="multi_classifier",
        type=str,
        nargs=1,
        default=["svm"],
        choices=["svm"],
        # Typo fix: was "Multi-clas classifier".
        help="Multi-class classifier to classify anomalies.",
    )
    parser.add_argument(
        "--train",
        action="store_true",
        default=False,
        # Fix: adjacent literals previously concatenated to "Otherwiseit".
        help="If set, logclass will train on the given data. Otherwise "
             "it will run inference on it.",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        default=False,
        help="Force training overwriting previous output with same id.",
    )
    parser.add_argument(
        "--id",
        metavar="id",
        type=str,
        nargs=1,
        help="Experiment id. Automatically generated if not specified.",
    )
    parser.add_argument(
        "--swap",
        action="store_true",
        default=False,
        help="Swap testing/training data in kfold cross validation.",
    )
    return parser
# --- init_params.py (continued) ---
def parse_main_args(args):
    """Parse provided args into the experiment params dict (paths, ids,
    classifier/feature/report selections)."""
    params = {
        "report": args.report,
        "train": args.train,
        "force": args.force,
        "base_dir": args.base_dir[0],
        "logs_type": args.logs_type[0],
        "healthy_label": args.healthy_label[0],
        "features": args.features,
        "binary_classifier": args.binary_classifier[0],
        "multi_classifier": args.multi_classifier[0],
        "swap": args.swap,
    }
    if args.raw_logs:
        params["raw_logs"] = os.path.normpath(args.raw_logs[0])
    if args.kfold:
        params["kfold"] = args.kfold[0]
    if args.logs:
        params['logs'] = os.path.normpath(args.logs[0])
    else:
        # Default preprocessed logs path derived from the logs type.
        params['logs'] = os.path.join(
            params['base_dir'],
            "preprocessed_logs",
            f"{params['logs_type']}.txt"
        )
    if args.id:
        params['id'] = args.id[0]
    else:
        if not params["train"]:
            # NOTE(review): these adjacent string literals concatenate
            # without separating spaces, so the warning text runs words
            # together when displayed.
            warnings.warn(
                "--id parameter is not set when running inference."
                "If --train is not set, you might want to provide the"
                "experiment id of your best training experiment run,"
                " E.g. `--id 2310136305`"
            )
        params['id'] = str(uuid4().time_low)
    print(f"\nExperiment ID: {params['id']}")
    # Creating experiments results folder with the format
    # {experiment_module_name}_{logs_type}_{id}
    experiment_name = os.path.basename(sys.argv[0]).split('.')[0]
    params['id_dir'] = os.path.join(
        params['base_dir'],
        '_'.join((
            experiment_name,
            params['logs_type'],
            params['id']
        ))
    )
    if args.models_dir:
        params['models_dir'] = os.path.normpath(args.models_dir[0])
    else:
        params['models_dir'] = os.path.join(
            params['id_dir'],
            "models",
        )
    if args.features_dir:
        params['features_dir'] = os.path.normpath(args.features_dir[0])
    else:
        params['features_dir'] = os.path.join(
            params['id_dir'],
            "features",
        )
    params['results_dir'] = os.path.join(params['id_dir'], "results")
    return params


# --- logclass.py ---
from sklearn.model_selection import StratifiedKFold
from .utils import (
    save_params,
    load_params,
    file_handling,
    TestingParameters,
    print_params,
)
from .preprocess import registry as preprocess_registry
from .preprocess.utils import load_logs
from .feature_engineering.utils import (
    binary_train_gtruth,
    multi_features,
    extract_features,
)
from tqdm import tqdm
from .models import binary_registry as binary_classifier_registry
from .models import multi_registry as multi_classifier_registry
from .reporting import bb_registry as black_box_report_registry
from .reporting import wb_registry as white_box_report_registry
from .init_params import init_main_args, parse_main_args


def init_args():
    """Init command line args used for configuration."""
    parser = init_main_args()
    return parser.parse_args()


def parse_args(args):
    """Parse provided args for runtime configuration."""
    params = parse_main_args(args)
    return params


def inference(params, x_data, y_data, target_names):
    # Inference: run the saved binary anomaly detector, then the saved
    # multi-class classifier on the anomalous subset, printing reports.
    # Feature engineering
    x_test, vocabulary = extract_features(x_data, params)
    # Binary training features
    y_test = binary_train_gtruth(y_data)
    # Binary PU estimator with RF
    # Load Trained PU Estimator
    binary_clf_getter =\
        binary_classifier_registry.get_binary_model(
            params['binary_classifier'])
    binary_clf = binary_clf_getter(params)
    binary_clf.load()
    # Anomaly detection
    y_pred_pu = binary_clf.predict(x_test)
    get_accuracy = black_box_report_registry.get_bb_report('acc')
    binary_acc = get_accuracy(y_test, y_pred_pu)
    # MultiClass remove healthy logs
    x_infer_multi, y_infer_multi = multi_features(x_test, y_data)
    # Load MultiClass
    multi_classifier_getter =\
        multi_classifier_registry.get_multi_model(params['multi_classifier'])
    multi_classifier = multi_classifier_getter(params)
    multi_classifier.load()
    # Anomaly Classification
    pred = multi_classifier.predict(x_infer_multi)
    get_multi_acc = black_box_report_registry.get_bb_report('multi_acc')
    score = get_multi_acc(y_infer_multi, pred)
    print(binary_acc, score)
    # Each requested report is attempted against every reporting surface;
    # lookups/calls that do not apply raise and are silently skipped.
    for report in params['report']:
        try:
            get_bb_report = black_box_report_registry.get_bb_report(report)
            result = get_bb_report(y_test, y_pred_pu)
        except Exception:
            pass
        else:
            print(f'Binary classification {report} report:')
            print(result)
        try:
            get_bb_report = black_box_report_registry.get_bb_report(report)
            result = get_bb_report(y_infer_multi, pred)
        except Exception:
            pass
        else:
            print(f'Multi classification {report} report:')
            print(result)
        try:
            get_wb_report = white_box_report_registry.get_wb_report(report)
            result =\
                get_wb_report(params, binary_clf.model, vocabulary,
                              target_names=target_names, top_features=5)
        except Exception:
            pass
        else:
            print(f'Multi classification {report} report:')
            print(result)


def train(params, x_data, y_data, target_names):
    # KFold Cross Validation
    kfold = StratifiedKFold(n_splits=params['kfold']).split(x_data, y_data)
    best_pu_fs = 0.
    best_multi = 0.
    for train_index, test_index in tqdm(kfold):
        x_train, x_test = x_data[train_index], x_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]
        x_train, vocabulary = extract_features(x_train, params)
        # Test-split features are extracted with train temporarily off so
        # the dicts saved above are reused instead of rebuilt.
        with TestingParameters(params):
            x_test, _ = extract_features(x_test, params)
        # Binary training features
        y_test_pu = binary_train_gtruth(y_test)
        y_train_pu = binary_train_gtruth(y_train)
        # Binary PULearning with RF
        binary_clf_getter =\
            binary_classifier_registry.get_binary_model(
                params['binary_classifier'])
        binary_clf = binary_clf_getter(params)
        binary_clf.fit(x_train, y_train_pu)
        y_pred_pu = binary_clf.predict(x_test)
        get_accuracy = black_box_report_registry.get_bb_report('acc')
        binary_acc = get_accuracy(y_test_pu, y_pred_pu)
        # Multi-class training features
        x_train_multi, y_train_multi =\
            multi_features(x_train, y_train)
        x_test_multi, y_test_multi = multi_features(x_test, y_test)
        # MultiClass
        multi_classifier_getter =\
            multi_classifier_registry.get_multi_model(
                params['multi_classifier'])
        multi_classifier = multi_classifier_getter(params)
        multi_classifier.fit(x_train_multi, y_train_multi)
        pred = multi_classifier.predict(x_test_multi)
        get_multi_acc = black_box_report_registry.get_bb_report('multi_acc')
        score = get_multi_acc(y_test_multi, pred)
        # A fold is "better" on a higher binary score, ties broken by the
        # multi-class score.
        better_results = (
            binary_acc > best_pu_fs
            or (binary_acc == best_pu_fs and score > best_multi)
        )
        if better_results:
            if binary_acc > best_pu_fs:
                best_pu_fs = binary_acc
            save_params(params)
            if score > best_multi:
                best_multi = score
            print(binary_acc, score)
            # TryCatch are necessary since I'm trying to consider all
            # reports the same when they are not
            for report in params['report']:
                try:
                    get_bb_report = black_box_report_registry.get_bb_report(report)
                    result = get_bb_report(y_test_pu, y_pred_pu)
                except Exception:
                    pass
                else:
                    print(f'Binary classification {report} report:')
                    print(result)
                try:
                    get_bb_report = black_box_report_registry.get_bb_report(report)
                    result = get_bb_report(y_test_multi, pred)
                except Exception:
                    pass
                else:
                    print(f'Multi classification {report} report:')
                    print(result)
                try:
                    get_wb_report = white_box_report_registry.get_wb_report(report)
                    result =\
                        get_wb_report(params, multi_classifier.model, vocabulary,
                                      target_names=target_names, top_features=5)
                except Exception:
                    pass
                else:
                    print(f'Multi classification {report} report:')
                    print(result)


def main():
    # Init params
    params = parse_args(init_args())
    if not params['train']:
        load_params(params)
    print_params(params)
    file_handling(params)
    # TODO: handle the case when the experiment ID already exists - this I think is the only one that matters
    # Filter params from raw logs
    if 'raw_logs' in params:
        preprocess = preprocess_registry.get_preprocessor(params['logs_type'])
        preprocess(params)
    # Load filtered params from file
    x_data, y_data, target_names = load_logs(params)
    if params['train']:
        train(params, x_data, y_data, target_names)
    else:
        inference(params, x_data, y_data, target_names)


if __name__ == "__main__":
    main()
    def __init__(self, model, params):
        # Wrapped estimator plus bookkeeping used by save/load and timing.
        self.model = model
        self.params = params
        # Original class name of the wrapped model, used in log prints.
        self.name = type(model).__name__
        self.train_time = None  # seconds spent in fit(); set by fit()
        self.run_time = None  # seconds spent in predict(); set by predict()

    @abstractmethod
    def save(self, **kwargs):
        """
        Abstract method for the subclass to implement how the model
        is saved. Should use the experiment id as reference.
        """
        pass

    @abstractmethod
    def load(self, **kwargs):
        """
        Abstract method for the subclass to implement how it's meant
        to be loaded. Should correspond to how the save method saves
        the model.
        """
        pass

    @print_step
    def predict(self, X, **kwargs):
        """
        Wraps original model predict and times its running time.
        """
        t0 = time()
        pred = self.model.predict(X, **kwargs)
        t1 = time()
        lapse = t1 - t0
        self.run_time = lapse
        print(f"{self.name} took {lapse}s to run inference.")
        return pred

    @print_step
    def fit(self, X, Y, **kwargs):
        """
        Wraps original model fit, times fit running time and saves
        the model.
        """
        t0 = time()
        self.model.fit(X, Y, **kwargs)
        t1 = time()
        lapse = t1 - t0
        self.train_time = lapse
        print(f"{self.name} took {lapse}s to train.")
        # Persist immediately so the freshly trained model is always on disk.
        self.save()
# --- models/binary_registry.py ---
"""Registry for binary models to be used for anomaly detection."""
_BINARY_MODELS = dict()


def register(name):
    """Register a binary anomaly-detection model constructor under *name*."""
    def decorator(func):
        _BINARY_MODELS[name] = func
        return func
    return decorator


def get_binary_model(model):
    """Look up a registered binary model constructor by name."""
    return _BINARY_MODELS[model]


# --- models/multi_registry.py ---
"""Registry for multi-class models to be used for anomaly classification."""
_MULTI_MODELS = dict()


def register(name):
    """Register a multi-class anomaly-classification model under *name*."""
    def decorator(func):
        _MULTI_MODELS[name] = func
        return func
    return decorator


def get_multi_model(model):
    """Look up a registered multi-class model constructor by name."""
    return _MULTI_MODELS[model]
# --- models/pu_learning.py ---
from .binary_registry import register
from ..puLearning.puAdapter import PUAdapter
from sklearn.ensemble import RandomForestClassifier
from .base_model import BaseModel
import os
import pickle


class PUAdapterWrapper(BaseModel):
    """BaseModel wrapper that persists/restores a PUAdapter estimator."""

    def __init__(self, model, params):
        super().__init__(model, params)

    def save(self, **kwargs):
        """Persist the underlying estimator and the PU constant c."""
        path = os.path.join(self.params['models_dir'], "pu_estimator.pkl")
        payload = {'estimator': self.model.estimator, 'c': self.model.c}
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

    def load(self, **kwargs):
        """Rebuild a fitted PUAdapter from the pickled estimator and c."""
        path = os.path.join(self.params['models_dir'], "pu_estimator.pkl")
        with open(path, 'rb') as handle:
            payload = pickle.load(handle)
        restored = PUAdapter(payload['estimator'])
        restored.c = payload['c']
        restored.estimator_fitted = True
        self.model = restored


@register("pu_learning")
def instatiate_pu_adapter(params, **kwargs):
    """
    Returns a RandomForest adapted to PU Learning, wrapped by
    PUAdapterWrapper.  kwargs override the default hyperparameters.
    """
    hparms = {
        'n_estimators': 10,
        'criterion': "entropy",
        'bootstrap': True,
        'n_jobs': -1,
    }
    hparms.update(kwargs)
    forest = RandomForestClassifier(**hparms)
    return PUAdapterWrapper(PUAdapter(forest), params)


# --- models/regular.py ---
from .binary_registry import register
from sklearn.ensemble import RandomForestClassifier
from .base_model import BaseModel
import os
import pickle


class RegularClassifierWrapper(BaseModel):
    """BaseModel wrapper around a plain binary RandomForest."""

    def __init__(self, model, params):
        super().__init__(model, params)

    def save(self, **kwargs):
        """Pickle the whole classifier under <models_dir>/regular.pkl."""
        path = os.path.join(self.params['models_dir'], "regular.pkl")
        with open(path, 'wb') as handle:
            pickle.dump(self.model, handle)

    def load(self, **kwargs):
        """Restore the classifier pickled by save()."""
        path = os.path.join(self.params['models_dir'], "regular.pkl")
        with open(path, 'rb') as handle:
            self.model = pickle.load(handle)


@register("regular")
def instatiate_regular_classifier(params, **kwargs):
    """
    Returns a plain RandomForest wrapped by RegularClassifierWrapper
    (no PU adaptation).  kwargs override the default hyperparameters.
    """
    hparms = {
        'n_estimators': 10,
        'bootstrap': True,
        'n_jobs': -1,
    }
    hparms.update(kwargs)
    return RegularClassifierWrapper(RandomForestClassifier(**hparms), params)
# --- models/svm.py ---
from .multi_registry import register
from sklearn.svm import LinearSVC
from .base_model import BaseModel
import os
import pickle


class SVMWrapper(BaseModel):
    # BaseModel wrapper around a LinearSVC multi-class anomaly classifier.

    def __init__(self, model, params):
        super().__init__(model, params)

    def save(self, **kwargs):
        # Pickles the fitted classifier under <models_dir>/multi.pkl.
        multi_file = os.path.join(
            self.params['models_dir'], "multi.pkl"
        )
        with open(multi_file, 'wb') as multi_clf_file:
            pickle.dump(self.model, multi_clf_file)

    def load(self, **kwargs):
        # Restores the classifier pickled by save().
        multi_file = os.path.join(
            self.params['models_dir'], "multi.pkl"
        )
        with open(multi_file, 'rb') as multi_clf_file:
            multi_classifier = pickle.load(multi_clf_file)
        self.model = multi_classifier


@register("svm")
def instatiate_svm(params, **kwargs):
    """
    Returns a LinearSVC wrapped by SVMWrapper; kwargs override the
    default hyperparameters.

    (Docstring corrected: the original said "a RF wrapped by the PU
    Learning Adapter", copy-pasted from the binary models.)
    """
    hparms = {
        'penalty': "l2",
        'dual': False,
        'tol': 1e-1,
    }
    hparms.update(kwargs)
    wrapped_svm = SVMWrapper(LinearSVC(**hparms), params)
    return wrapped_svm
# --- preprocess/bgl_preprocessor.py ---
from .registry import register
from .utils import process_logs, remove_parameters
import re

# Precompiled patterns for pulling fields out of a raw BGL line.
recid_regx = re.compile(r"^(\d+)")
separator = re.compile(r"(?:-.{1,3}){2} (.+)$")
msg_split_regx = re.compile(r"x'.+'")
severity = re.compile(r"(\w+)\s+(INFO|WARN|ERROR|FATAL)")


def process_line(line):
    # Converts one raw BGL line into "<label> <filtered message>\n",
    # or "" when the line should be dropped.
    line = line.strip()
    sep = separator.search(line)
    if sep:
        # NOTE(review): only the LAST whitespace-separated chunk of the
        # matched message is kept here, then anything up to an x'..' hex
        # blob is dropped — confirm this matches the BGL record layout.
        msg = sep.group(1).strip().split(' ')[-1].strip()
        msg = msg_split_regx.split(msg)[-1].strip()
        error_label = severity.search(line)
        recid = recid_regx.search(line)
        # Very short messages (<= 20 chars) are discarded as noise.
        if recid and error_label and len(msg) > 20:
            # recid = recid.group(1).strip() We may want to use it later
            general_label = error_label.group(2)
            label = error_label.group(1)
            if general_label == 'WARN':
                return ''
            if general_label == 'INFO':  # or label == 'WARN':
                # INFO lines are treated as healthy/unlabeled traffic.
                label = 'unlabeled'
            msg = remove_parameters(msg)
            if msg:
                msg = ' '.join((label, msg))
                msg = ''.join((msg, '\n'))
                return msg
    return ''


@register("bgl")
def preprocess_dataset(params):
    """
    Runs BGL logs preprocessing executor.
    """
    input_source = params['raw_logs']
    output = params['logs']
    # BGL uses 'unlabeled' as its healthy label (see process_line above).
    params['healthy_label'] = 'unlabeled'
    process_logs(input_source, output, process_line)
# --- preprocess/open_source_logs.py ---
import os
from multiprocessing import Pool
from tqdm import tqdm
from .registry import register
from .utils import remove_parameters


def process_line(line):
    # `line` is a (groundtruth_line, rawlog_line) pair from the zip below.
    label = line[0].strip()
    # Drop the first token of the raw log line (its leading id column).
    msg = " ".join(line[1].strip().split()[1:])
    msg = remove_parameters(msg)
    if msg:
        msg = " ".join((label, msg))
        msg = "".join((msg, "\n"))
        return msg
    return ""


def process_open_source(input_source, output):
    # Joins groundtruth.seq with rawlog.log line-by-line and writes the
    # labeled, parameter-stripped messages to `output`.
    with open(output, "w", encoding="latin-1") as f:
        gtruth = os.path.join(input_source, "groundtruth.seq")
        rawlog = os.path.join(input_source, "rawlog.log")
        # First pass only counts lines so tqdm can show total progress.
        with open(gtruth, "r", encoding="latin-1") as IN:
            line_count = sum(1 for line in IN)
        with open(gtruth, "r", encoding="latin-1") as in_gtruth:
            with open(rawlog, "r", encoding="latin-1") as in_log:
                IN = zip(in_gtruth, in_log)
                with Pool() as pool:
                    results = pool.imap(process_line, IN, chunksize=10000)
                    f.writelines(tqdm(results, total=line_count))


open_source_datasets = [
    "open_Apache",
    "open_bgl",
    "open_hadoop",
    "open_hdfs",
    "open_hpc",
    "open_proxifier",
    "open_zookeeper",
]

# Registers the same preprocessing function once per dataset name.  The
# function body does not reference `dataset`, so the usual late-binding
# closure pitfall does not apply here.
for dataset in open_source_datasets:
    @register(dataset)
    def preprocess_dataset(params):
        """
        Runs open source logs preprocessing executor.
        """
        input_source = params["raw_logs"]
        output = params["logs"]
        # These datasets mark healthy lines with the "NA" label.
        params["healthy_label"] = "NA"
        process_open_source(input_source, output)
# --- preprocess/registry.py ---
_PREPROCESSORS = dict()


def register(name):
    """Register a raw-log preprocessor function under *name*."""
    def decorator(func):
        _PREPROCESSORS[name] = func
        return func
    return decorator


def get_preprocessor(data_src):
    """Look up the preprocessor registered for the given logs type."""
    return _PREPROCESSORS[data_src]


# --- preprocess/utils.py ---
import re

# Compiled once at import time; remove_parameters runs per log line.
re_sub_1 = re.compile(r"(:(?=\s))|((?<=\s):)")
re_sub_2 = re.compile(r"(\d+\.)+\d+")
re_sub_3 = re.compile(r"\d{2}:\d{2}:\d{2}")
re_sub_4 = re.compile(r"Mar|Apr|Dec|Jan|Feb|Nov|Oct|May|Jun|Jul|Aug|Sep")
re_sub_5 = re.compile(r":?(\w+:)+")
re_sub_6 = re.compile(r"\.|\(|\)|\<|\>|\/|\-|\=|\[|\]")
p = re.compile(r"[^(A-Za-z)]")


def remove_parameters(msg):
    """Strip numeric/timestamp/key:value parameters from a log message and
    keep only tokens made purely of letters."""
    msg = re_sub_1.sub("", msg)
    msg = re_sub_2.sub("", msg)
    msg = re_sub_3.sub("", msg)
    msg = re_sub_4.sub("", msg)
    msg = re_sub_5.sub("", msg)
    msg = re_sub_6.sub(" ", msg)
    # Drop every token containing a non-letter character.
    kept = [token for token in msg.split() if not p.search(token)]
    return " ".join(kept)


def remove_parameters_slower(msg):
    """Uncompiled-regex variant of remove_parameters (kept for reference)."""
    msg = re.sub(r"(:(?=\s))|((?<=\s):)", "", msg)
    msg = re.sub(r"(\d+\.)+\d+", "", msg)
    msg = re.sub(r"\d{2}:\d{2}:\d{2}", "", msg)
    msg = re.sub(r"Mar|Apr|Dec|Jan|Feb|Nov|Oct|May|Jun|Jul|Aug|Sep", "", msg)
    msg = re.sub(r":?(\w+:)+", "", msg)
    msg = re.sub(r"\.|\(|\)|\<|\>|\/|\-|\=|\[|\]", " ", msg)
    letters_only = re.compile("[^(A-Za-z)]")
    kept = [token for token in msg.split() if not letters_only.search(token)]
    return " ".join(kept)
# --- preprocess/utils.py (continued) ---
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool
from ..decorators import print_step


@print_step
def process_logs(input_source, output, process_line=None):
    """Apply process_line to every raw log line (in a process pool) and
    write the non-empty results to `output`."""
    with open(output, "w", encoding='latin-1') as f:
        # First pass only counts lines so tqdm can show total progress.
        with open(input_source, 'r', encoding='latin-1') as IN:
            line_count = sum(1 for line in IN)
        with open(input_source, 'r', encoding='latin-1') as IN:
            with Pool() as pool:
                results = pool.imap(process_line, IN, chunksize=10000)
                f.writelines(tqdm(results, total=line_count))


@print_step
def load_logs(params, ignore_unlabeled=False):
    """Load preprocessed logs as (x_data, y_data, target_names).

    Each line is "<label> <message>".  The healthy label maps to -1.0;
    every other label gets a numeric id as it is first seen.  Note the id
    is len(label_dict) at insertion time, so ids are offsets into
    label_dict (which also counts the healthy label), not necessarily
    contiguous class indexes starting at 0.

    Parameters
    ----------
    params : dict with 'logs' (path) and 'healthy_label'.
    ignore_unlabeled : if True, healthy lines are skipped entirely.

    Returns
    -------
    x_data : numpy array of message strings.
    y_data : numpy array of numeric labels.
    target_names : list of anomaly label names in id order.
    """
    log_path = params['logs']
    unlabel_label = params['healthy_label']
    x_data = []
    y_data = []
    label_dict = {}
    target_names = []
    with open(log_path, 'r', encoding='latin-1') as IN:
        line_count = sum(1 for line in IN)
    with open(log_path, 'r', encoding='latin-1') as IN:
        for line in tqdm(IN, total=line_count):
            L = line.strip().split()
            label = L[0]
            if label not in label_dict:
                if ignore_unlabeled and label == unlabel_label:
                    continue
                if label == unlabel_label:
                    label_dict[label] = -1.0
                else:
                    # Fix: this branch previously re-tested
                    # `label not in label_dict`, which is always true at
                    # this point; plain `else` is equivalent and clearer.
                    label_dict[label] = len(label_dict)
                    target_names.append(label)
            x_data.append(" ".join(L[1:]))
            y_data.append(label_dict[label])
    x_data = np.array(x_data)
    y_data = np.array(y_data)
    return x_data, y_data, target_names
probabilistic binary classifier to positive-unlabled learning using the PosOnly method proposed by Elkan and Noto: Elkan, Charles, and Keith Noto. \"Learning classifiers from only positive and unlabeled data.\" Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2008. """ def __init__(self, estimator, hold_out_ratio=0.1, precomputed_kernel=False): """ estimator -- An estimator of p(s=1|x) that must implement: * predict_proba(X): Takes X, which can be a list of feature vectors or a precomputed kernel matrix and outputs p(s=1|x) for each example in X * fit(X,y): Takes X, which can be a list of feature vectors or a precomputed kernel matrix and takes y, which are the labels associated to the examples in X hold_out_ratio -- The ratio of training examples that must be held out of the training set of examples to estimate p(s=1|y=1) after training the estimator precomputed_kernel -- Specifies if the X matrix for predict_proba and fit is a precomputed kernel matrix """ self.estimator = estimator self.c = 1.0 self.hold_out_ratio = hold_out_ratio if precomputed_kernel: self.fit = self.__fit_precomputed_kernel else: self.fit = self.__fit_no_precomputed_kernel self.estimator_fitted = False def __str__(self): return 'Estimator:' + str(self.estimator) + '\n' + 'p(s=1|y=1,x) ~= ' + str(self.c) + '\n' + \ 'Fitted: ' + str(self.estimator_fitted) def __fit_precomputed_kernel(self, X, y): """ Fits an estimator of p(s=1|x) and estimates the value of p(s=1|y=1) using a subset of the training examples X -- Precomputed kernel matrix y -- Labels associated to each example in X (Positive label: 1.0, Negative label: -1.0) """ positives = np.where(y == 1.)[0] hold_out_size = np.ceil(len(positives) * self.hold_out_ratio) if len(positives) <= hold_out_size: raise('Not enough positive examples to estimate p(s=1|y=1,x). 
Need at least ' + str(hold_out_size + 1) + '.') np.random.shuffle(positives) hold_out = positives[:hold_out_size] #Hold out test kernel matrix X_test_hold_out = X[hold_out] keep = list(set(np.arange(len(y))) - set(hold_out)) X_test_hold_out = X_test_hold_out[:,keep] #New training kernel matrix X = X[:, keep] X = X[keep] y = np.delete(y, hold_out) self.estimator.fit(X, y) hold_out_predictions = self.estimator.predict_proba(X_test_hold_out) try: hold_out_predictions = hold_out_predictions[:,1] except: pass c = np.mean(hold_out_predictions) self.c = c self.estimator_fitted = True def __fit_no_precomputed_kernel(self, X, y): """ Fits an estimator of p(s=1|x) and estimates the value of p(s=1|y=1,x) X -- List of feature vectors y -- Labels associated to each feature vector in X (Positive label: 1.0, Negative label: -1.0) """ positives = np.where(y == 1.)[0] hold_out_size = np.ceil(len(positives) * self.hold_out_ratio) if len(positives) <= hold_out_size: raise('Not enough positive examples to estimate p(s=1|y=1,x). Need at least ' + str(hold_out_size + 1) + '.') np.random.shuffle(positives) #print hold_out_size #print type(hold_out_size) hold_out = positives[:int(hold_out_size)] X_hold_out = X[hold_out] X = np.delete(X, hold_out,0) y = np.delete(y, hold_out) self.estimator.fit(X, y) hold_out_predictions = self.estimator.predict_proba(X_hold_out) try: hold_out_predictions = hold_out_predictions[:,1] except: pass c = np.mean(hold_out_predictions) self.c = c self.estimator_fitted = True def predict_proba(self, X): """ Predicts p(y=1|x) using the estimator and the value of p(s=1|y=1) estimated in fit(...) 
X -- List of feature vectors or a precomputed kernel matrix """ if not self.estimator_fitted: raise Exception('The estimator must be fitted before calling predict_proba(...).') probabilistic_predictions = self.estimator.predict_proba(X) try: probabilistic_predictions = probabilistic_predictions[:,1] except: pass return probabilistic_predictions / self.c def predict(self, X, treshold=0.5): """ Assign labels to feature vectors based on the estimator's predictions X -- List of feature vectors or a precomputed kernel matrix treshold -- The decision treshold between the positive and the negative class """ if not self.estimator_fitted: raise Exception('The estimator must be fitted before calling predict(...).') return np.array([1. if p > treshold else -1. for p in self.predict_proba(X)]) ================================================ FILE: reporting/__init__.py ================================================ __all__ = [ "accuracy", "confusion_matrix", "multi_class_acc", "top_k_svm", "microf1", "macrof1", ] ================================================ FILE: reporting/accuracy.py ================================================ from .bb_registry import register from sklearn.metrics import f1_score @register('acc') def model_accuracy(y, pred): return f1_score(y, pred) ================================================ FILE: reporting/bb_registry.py ================================================ """Registry for black box reports or metrics.""" _BB_REPORTS = dict() def register(name): """Registers a new black box report or metric function.""" def add_to_dict(func): _BB_REPORTS[name] = func return func return add_to_dict def get_bb_report(model): """Fetches the black box report or metric function.""" return _BB_REPORTS[model] ================================================ FILE: reporting/confusion_matrix.py ================================================ from .bb_registry import register from sklearn.metrics import confusion_matrix @register('confusion_matrix') def 
report(y, pred): return confusion_matrix(y, pred) ================================================ FILE: reporting/macrof1.py ================================================ from .bb_registry import register from sklearn.metrics import f1_score @register('macro') def model_accuracy(y, pred): return f1_score(y, pred, average='macro') ================================================ FILE: reporting/microf1.py ================================================ from .bb_registry import register from sklearn.metrics import f1_score @register('micro') def model_accuracy(y, pred): return f1_score(y, pred, average='micro') ================================================ FILE: reporting/multi_class_acc.py ================================================ from .bb_registry import register from sklearn.metrics import accuracy_score @register('multi_acc') def model_accuracy(y, pred): return accuracy_score(y, pred) ================================================ FILE: reporting/top_k_svm.py ================================================ from .wb_registry import register import numpy as np def get_feature_names(params, vocabulary, add_length=True): feature_names = zip(vocabulary.keys(), vocabulary.values()) feature_names = sorted(feature_names, key=lambda x: x[1]) feature_names = [x[0] for x in feature_names] if 'length' in params['features']: feature_names.append('LENGTH') return np.array(feature_names) @register('top_k_svm') def get_top_k_SVM_features(params, model, vocabulary, **kwargs): hparms = { 'target_names': [], 'top_features': 5, } hparms.update(kwargs) top_k_label = {} feature_names = get_feature_names(params, vocabulary) for i, label in enumerate(hparms['target_names']): if len(hparms['target_names']) < 3 and i == 1: break # coef is unidemensional when there's only two labels coef = model.coef_[i] top_coefficients = np.argsort(coef)[-hparms['top_features']:] top_k_features = feature_names[top_coefficients] top_k_label[label] = list(reversed(top_k_features)) return 
top_k_label ================================================ FILE: reporting/wb_registry.py ================================================ """Registry for white box reports or metrics.""" _WB_REPORTS = dict() def register(name): """Registers a new white box report or metric function.""" def add_to_dict(func): _WB_REPORTS[name] = func return func return add_to_dict def get_wb_report(model): """Fetches the white box report or metric function.""" return _WB_REPORTS[model] ================================================ FILE: requirements.txt ================================================ certifi==2019.9.11 joblib==0.14.0 numpy==1.17.4 pandas==0.25.3 python-dateutil==2.8.1 pytz==2019.3 scikit-learn==0.21.3 scipy==1.3.3 six==1.13.0 sklearn==0.0 tqdm==4.39.0 wincertstore==0.2 ================================================ FILE: run_binary.py ================================================ from .utils import ( load_params, file_handling, print_params, ) from .preprocess import registry as preprocess_registry from .preprocess.utils import load_logs from .feature_engineering.utils import ( binary_train_gtruth, extract_features, ) from .models import binary_registry as binary_classifier_registry from .reporting import bb_registry as black_box_report_registry from .init_params import init_main_args, parse_main_args def init_args(): """Init command line args used for configuration.""" parser = init_main_args() return parser.parse_args() def parse_args(args): """Parse provided args for runtime configuration.""" params = parse_main_args(args) params.update({'train': False}) return params def inference(params, x_data, y_data, target_names): # Inference # Feature engineering x_test, _ = extract_features(x_data, params) # Binary training features y_test = binary_train_gtruth(y_data) # Binary PU estimator with RF # Load Trained PU Estimator binary_clf_getter =\ binary_classifier_registry.get_binary_model( params['binary_classifier']) binary_clf = binary_clf_getter(params) 
binary_clf.load() # Anomaly detection y_pred_pu = binary_clf.predict(x_test) get_accuracy = black_box_report_registry.get_bb_report('acc') binary_acc = get_accuracy(y_test, y_pred_pu) print(binary_acc) for report in params['report']: try: get_bb_report = black_box_report_registry.get_bb_report(report) result = get_bb_report(y_test, y_pred_pu) except Exception: pass else: print(f'Binary classification {report} report:') print(result) def main(): # Init params params = parse_args(init_args()) load_params(params) print_params(params) file_handling(params) # Filter params from raw logs if "raw_logs" in params: preprocess = preprocess_registry.get_preprocessor(params['logs_type']) preprocess(params) # Load filtered params from file print('Loading logs') x_data, y_data, target_names = load_logs(params) inference(params, x_data, y_data, target_names) if __name__ == "__main__": main() ================================================ FILE: train_binary.py ================================================ from sklearn.model_selection import StratifiedKFold from .utils import ( save_params, file_handling, TestingParameters, print_params, ) from .preprocess import registry as preprocess_registry from .preprocess.utils import load_logs from .feature_engineering.utils import ( binary_train_gtruth, extract_features, ) from tqdm import tqdm from .models import binary_registry as binary_classifier_registry from .reporting import bb_registry as black_box_report_registry from .init_params import init_main_args, parse_main_args def init_args(): """Init command line args used for configuration.""" parser = init_main_args() return parser.parse_args() def parse_args(args): """Parse provided args for runtime configuration.""" params = parse_main_args(args) params.update({'train': True}) return params def train(params, x_data, y_data, target_names): # KFold Cross Validation kfold = StratifiedKFold(n_splits=params['kfold']).split(x_data, y_data) best_pu_fs = 0. 
for train_index, test_index in tqdm(kfold): x_train, x_test = x_data[train_index], x_data[test_index] y_train, y_test = y_data[train_index], y_data[test_index] x_train, _ = extract_features(x_train, params) with TestingParameters(params): x_test, _ = extract_features(x_test, params) # Binary training features y_test_pu = binary_train_gtruth(y_test) y_train_pu = binary_train_gtruth(y_train) # Binary PULearning with RF binary_clf_getter =\ binary_classifier_registry.get_binary_model( params['binary_classifier']) binary_clf = binary_clf_getter(params) binary_clf.fit(x_train, y_train_pu) y_pred_pu = binary_clf.predict(x_test) get_accuracy = black_box_report_registry.get_bb_report('acc') binary_acc = get_accuracy(y_test_pu, y_pred_pu) better_results = binary_acc > best_pu_fs if better_results: if binary_acc > best_pu_fs: best_pu_fs = binary_acc save_params(params) binary_clf.save() print(binary_acc) for report in params['report']: try: get_bb_report = black_box_report_registry.get_bb_report(report) result = get_bb_report(y_test_pu, y_pred_pu) except Exception: pass else: print(f'Binary classification {report} report:') print(result) def main(): # Init params params = parse_args(init_args()) file_handling(params) # Filter params from raw logs if "raw_logs" in params: preprocess = preprocess_registry.get_preprocessor(params['logs_type']) preprocess(params) # Load filtered params from file print('Loading logs') x_data, y_data, target_names = load_logs(params) print_params(params) train(params, x_data, y_data, target_names) if __name__ == "__main__": main() ================================================ FILE: train_multi.py ================================================ from sklearn.model_selection import StratifiedKFold from .utils import ( save_params, file_handling, TestingParameters, print_params, save_results, ) from .preprocess import registry as preprocess_registry from .preprocess.utils import load_logs from .feature_engineering.utils import ( multi_features, 
extract_features, ) from tqdm import tqdm from .models import multi_registry as multi_classifier_registry from .reporting import bb_registry as black_box_report_registry from .init_params import init_main_args, parse_main_args def init_args(): """Init command line args used for configuration.""" parser = init_main_args() return parser.parse_args() def parse_args(args): """Parse provided args for runtime configuration.""" params = parse_main_args(args) params.update({'train': True}) return params def init_results(): results = { 'exp_name': [], 'logs_type': [], 'macro': [], 'micro': [], 'train_time': [], 'run_time': [], } return results def add_result(results, params, macro, micro, train_time, run_time): results['exp_name'].append(params['id']) results['logs_type'].append(params['logs_type']) results['macro'].append(macro) results['micro'].append(micro) results['train_time'].append(train_time) results['run_time'].append(run_time) def train(params, x_data, y_data, target_names): results = init_results() # KFold Cross Validation kfold = StratifiedKFold(n_splits=params['kfold']).split(x_data, y_data) best_multi = 0. 
for train_index, test_index in tqdm(kfold): # Test & Train are interchanged to enable testing with 10% of the data if params['swap']: x_test, x_train = x_data[train_index], x_data[test_index] y_test, y_train = y_data[train_index], y_data[test_index] else: x_train, x_test = x_data[train_index], x_data[test_index] y_train, y_test = y_data[train_index], y_data[test_index] x_train, _ = extract_features(x_train, params) print(y_train.shape, y_test.shape) with TestingParameters(params): x_test, _ = extract_features(x_test, params) # Multi-class training features x_train_multi, y_train_multi =\ multi_features(x_train, y_train) x_test_multi, y_test_multi = multi_features(x_test, y_test) # MultiClass multi_classifier_getter =\ multi_classifier_registry.get_multi_model(params['multi_classifier']) multi_classifier = multi_classifier_getter(params) multi_classifier.fit(x_train_multi, y_train_multi) pred = multi_classifier.predict(x_test_multi) get_multi_acc = black_box_report_registry.get_bb_report('macro') macro = get_multi_acc(y_test_multi, pred) get_multi_acc = black_box_report_registry.get_bb_report('micro') micro = get_multi_acc(y_test_multi, pred) better_results = macro > best_multi if better_results: save_params(params) best_multi = macro print(macro) add_result( results, params, macro, micro, multi_classifier.train_time, multi_classifier.run_time ) save_results(results, params) def main(): # Init params params = parse_args(init_args()) print_params(params) file_handling(params) # Filter params from raw logs if "raw_logs" in params: preprocess = preprocess_registry.get_preprocessor(params['logs_type']) preprocess(params) # Load filtered params from file x_data, y_data, target_names = load_logs(params) train(params, x_data, y_data, target_names) if __name__ == "__main__": main() ================================================ FILE: utils.py ================================================ import os import json import shutil import pandas as pd # trim is only used when 
showing the top keywords for each class def trim(s): """Trim string to fit on terminal (assuming 80-column display)""" return s if len(s) <= 80 else s[:77] + "..." class TestingParameters(): def __init__(self, params): self.params = params self.original_state = params['train'] def __enter__(self): self.params['train'] = False def __exit__(self, exc_type, exc_value, traceback): self.params['train'] = self.original_state def load_params(params): params_file = os.path.join( params['id_dir'], f"best_params.json") with open(params_file, "r") as fp: best_params = json.load(fp) params.update(best_params) def save_params(params): params_file = os.path.join( params['id_dir'], f"best_params.json") with open(params_file, "w") as fp: json.dump(params, fp) def file_handling(params): if "raw_logs" in params: if not os.path.exists(params['raw_logs']): raise FileNotFoundError( f"File {params['raw_logs']} doesn't exist. " + "Please provide the raw logs path." ) logs_directory = os.path.dirname(params['logs']) if not os.path.exists(logs_directory): os.makedirs(logs_directory) else: # Checks if preprocessed logs exist as input if not os.path.exists(params['logs']): raise FileNotFoundError( f"File {params['base_dir']} doesn't exist. " + "Preprocess target logs first and provide their path." ) if params['train']: # Checks if the experiment id already exists if os.path.exists(params["id_dir"]) and not params["force"]: raise FileExistsError( f"directory '{params['id_dir']} already exists. " + "Run with --force to overwrite." + f"If --force is used, you could lose your training results." ) if os.path.exists(params["id_dir"]): shutil.rmtree(params["id_dir"]) for target_dir in ['id_dir', 'models_dir', 'features_dir']: os.makedirs(params[target_dir]) else: # Checks if input models and features are provided for concern in ['models_dir', 'features_dir']: target_path = params[concern] if not os.path.exists(target_path): raise FileNotFoundError( "directory '{} doesn't exist. 
".format(target_path) + "Run train first before running inference." ) def print_params(params): print("{:-^80}".format("params")) print("Beginning experiment using the following configuration:\n") for param, value in params.items(): print("\t{:>13}: {}".format(param, value)) print() print("-" * 80) def save_results(results, params): df = pd.DataFrame(results) file_name = os.path.join( params['id_dir'], "results.csv", ) df.to_csv(file_name, index=False)