Repository: Michedev/VAE_anomaly_detection
Branch: master
Commit: fa2fb6a3d44c
Files: 14
Total size: 37.9 KB

Directory structure:
gitextract_si3hjwri/

├── .github/
│   └── workflows/
│       ├── python-publish-on-release.yml
│       └── python-test-build.yml
├── .gitignore
├── .projectignore
├── dataset.py
├── model/
│   ├── VAE.py
│   ├── VAE_tf1.py
│   ├── __init__.py
│   └── encoder_decoder.py
├── pyproject.toml
├── readme.md
├── tests/
│   ├── __init__.py
│   └── test_pytorch_model.py
└── train.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/python-publish-on-release.yml
================================================

name: Python release on pypi


on:
  release:
    types: [published]
  workflow_dispatch:


jobs:
  build:

    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
        with:
          python-version: "3.10"
      - name: Assemble python package folder
        run: |
            mv model/ vae_anomaly_detection/
      - name: Install pypa/hatch
        run: pip install hatch
      - name: Build a binary wheel and a source tarball
        run: hatch build
      - name: Publish distribution 📦 to PyPI
        run: hatch publish
        env:
          HATCH_INDEX_USER: __token__
          HATCH_INDEX_AUTH: ${{ secrets.PYPI_PASSWORD }}


================================================
FILE: .github/workflows/python-test-build.yml
================================================

name: Python test and build

on:
  push:
    tags-ignore:
      - "*"
  schedule:
    - cron: "0 0 * * 0" # Run every Sunday at midnight
  workflow_dispatch: # Manually trigger a workflow run
    

jobs:
  ci:
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
        os: [ubuntu-20.04]
    runs-on: ${{ matrix.os }}
    env:
      HATCH_ENV: test
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Hatch 🥚
        run: pip install hatch
      - name: Install dependencies
        run: hatch env create test
      -  name: Test with pytest
         run: hatch run test:pytest
      - name: rename folder
        run: mv model/ vae_anomaly_detection/
      - name: Build package 📦
        run: hatch build

================================================
FILE: .gitignore
================================================
envs/
.vscode/
.idea
dist/
poetry.lock


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


================================================
FILE: .projectignore
================================================
# This file contains a list of match patterns that instructs
# anaconda-project to exclude certain files or directories when
# building a project archive. The file format is a simplfied
# version of Git's .gitignore file format. In fact, if the
# project is hosted in a Git repository, these patterns can be
# merged into the .gitignore file and this file removed.
# See the anaconda-project documentation for more details.

# Python caching
*.pyc
*.pyd
*.pyo
__pycache__/

# Jupyter & Spyder stuff
.ipynb_checkpoints/
.Trash-*/
/.spyderproject


================================================
FILE: dataset.py
================================================
from typing import Tuple
import torch
from torch.utils.data import Dataset, TensorDataset
from torchvision.datasets import MNIST

def rand_dataset(num_rows=60_000, num_columns=100) -> Dataset:
    return TensorDataset(torch.rand(num_rows, num_columns))


def mnist_dataset(train=True) -> Dataset:
    """
    Returns the MNIST dataset for training or testing.
    
    Args:
    train (bool): If True, returns the training dataset. Otherwise, returns the testing dataset.
    
    Returns:
    Dataset: The MNIST dataset.
    """
    return MNIST(root='./data', train=train, download=True, transform=None)


================================================
FILE: model/VAE.py
================================================
from abc import abstractmethod, ABC

import torch
from torch import nn
from torch.distributions import Normal, kl_divergence
from torch.nn.functional import softplus
import pytorch_lightning as pl


class VAEAnomalyDetection(pl.LightningModule, ABC):
    """
    Variational Autoencoder (VAE) for anomaly detection. The model learns a low-dimensional representation of the input
    data using an encoder-decoder architecture, and uses the learned representation to detect anomalies.

    The model is trained to minimize the Kullback-Leibler (KL) divergence between the learned distribution of the latent
    variables and the prior distribution (a standard normal distribution). It is also trained to maximize the likelihood
    of the input data under the learned distribution.

    This implementation uses PyTorch Lightning to simplify training and improve reproducibility.
    """

    def __init__(self, input_size: int, latent_size: int, L: int = 10, lr: float = 1e-3, log_steps: int = 1_000):
        """
        Initializes the VAEAnomalyDetection model.

        Args:
            input_size (int): Number of input features.
            latent_size (int): Size of the latent space.
            L (int, optional): Number of samples in the latent space to detect the anomaly. Defaults to 10.
            lr (float, optional): Learning rate. Defaults to 1e-3.
            log_steps (int, optional): Number of steps between each logging. Defaults to 1_000.
        """
        super().__init__()
        self.L = L
        self.lr = lr
        self.input_size = input_size
        self.latent_size = latent_size
        self.encoder = self.make_encoder(input_size, latent_size)
        self.decoder = self.make_decoder(latent_size, input_size)
        self.prior = Normal(0, 1)
        self.log_steps = log_steps

    @abstractmethod
    def make_encoder(self, input_size: int, latent_size: int) -> nn.Module:
        """
        Abstract method to create the encoder network.

        Args:
            input_size (int): Number of input features.
            latent_size (int): Size of the latent space.

        Returns:
            nn.Module: Encoder network.
        """
        pass

    @abstractmethod
    def make_decoder(self, latent_size: int, output_size: int) -> nn.Module:
        """
        Abstract method to create the decoder network.

        Args:
            latent_size (int): Size of the latent space.
            output_size (int): Number of output features.

        Returns:
            nn.Module: Decoder network.
        """
        pass

    def forward(self, x: torch.Tensor) -> dict:
        """
        Computes the forward pass of the model and returns the loss and other relevant information.

        Args:
            x (torch.Tensor): Input data. Shape [batch_size, num_features].

        Returns:
            Dictionary containing:
            - loss: Total loss.
            - kl: KL-divergence loss.
            - recon_loss: Reconstruction loss.
            - recon_mu: Mean of the reconstructed input.
            - recon_sigma: Standard deviation of the reconstructed input.
            - latent_dist: Distribution of the latent space.
            - latent_mu: Mean of the latent space.
            - latent_sigma: Standard deviation of the latent space.
            - z: Sampled latent space.

        """
        pred_result = self.predict(x)
        x = x.unsqueeze(0)  # unsqueeze to broadcast input across sample dimension (L)
        log_lik = Normal(pred_result['recon_mu'], pred_result['recon_sigma']).log_prob(x).mean(
            dim=0)  # average over sample dimension
        log_lik = log_lik.mean(dim=0).sum()
        kl = kl_divergence(pred_result['latent_dist'], self.prior).mean(dim=0).sum()
        loss = kl - log_lik
        return dict(loss=loss, kl=kl, recon_loss=log_lik, **pred_result)

    def predict(self, x) -> dict:
        """
        Compute the output of the VAE. Does not compute the loss compared to the forward method.

        Args:
            x: Input tensor of shape [batch_size, input_size].

        Returns:
            Dictionary containing:
            - latent_dist: Distribution of the latent space.
            - latent_mu: Mean of the latent space.
            - latent_sigma: Standard deviation of the latent space.
            - recon_mu: Mean of the reconstructed input.
            - recon_sigma: Standard deviation of the reconstructed input.
            - z: Sampled latent space.

        """
        batch_size = len(x)
        latent_mu, latent_sigma = self.encoder(x).chunk(2, dim=1) #both with size [batch_size, latent_size]
        latent_sigma = softplus(latent_sigma)
        dist = Normal(latent_mu, latent_sigma)
        z = dist.rsample([self.L])  # shape: [L, batch_size, latent_size]
        z = z.view(self.L * batch_size, self.latent_size)
        recon_mu, recon_sigma = self.decoder(z).chunk(2, dim=1)
        recon_sigma = softplus(recon_sigma)
        recon_mu = recon_mu.view(self.L, *x.shape)
        recon_sigma = recon_sigma.view(self.L, *x.shape)
        return dict(latent_dist=dist, latent_mu=latent_mu,
                    latent_sigma=latent_sigma, recon_mu=recon_mu,
                    recon_sigma=recon_sigma, z=z)

    def is_anomaly(self, x: torch.Tensor, alpha: float = 0.05) -> torch.Tensor:
        """
        Determines if input samples are anomalous based on a given threshold.
        
        Args:
            x: Input tensor of shape (batch_size, num_features).
            alpha: Anomaly threshold. Values with probability lower than alpha are considered anomalous.
        
        Returns:
            A binary tensor of shape (batch_size,) where `True` represents an anomalous sample and `False` represents a 
            normal sample.
        """
        p = self.reconstructed_probability(x)
        return p < alpha

    def reconstructed_probability(self, x: torch.Tensor) -> torch.Tensor:
        """
        Computes the probability density of the input samples under the learned
        distribution of reconstructed data.

        Args:
            x: Input data tensor of shape (batch_size, num_features).

        Returns:
            A tensor of shape (batch_size,) containing the probability densities of
            the input samples under the learned distribution of reconstructed data.
        """
        with torch.no_grad():
            pred = self.predict(x)
        recon_dist = Normal(pred['recon_mu'], pred['recon_sigma'])
        x = x.unsqueeze(0)
        p = recon_dist.log_prob(x).exp().mean(dim=0).mean(dim=-1)  # vector of shape [batch_size]
        return p

    def generate(self, batch_size: int = 1) -> torch.Tensor:
        """
        Generates a batch of samples from the learned prior distribution.

        Args:
            batch_size: Number of samples to generate.

        Returns:
            A tensor of shape (batch_size, num_features) containing the generated
            samples.
        """
        z = self.prior.sample((batch_size, self.latent_size))
        recon_mu, recon_sigma = self.decoder(z).chunk(2, dim=1)
        recon_sigma = softplus(recon_sigma)
        return recon_mu + recon_sigma * torch.rand_like(recon_sigma)
    
    
    def training_step(self, batch, batch_idx):
        x = batch
        loss = self.forward(x)
        if self.global_step % self.log_steps == 0:
            self.log('train/loss', loss['loss'])
            self.log('train/loss_kl', loss['kl'], prog_bar=False)
            self.log('train/loss_recon', loss['recon_loss'], prog_bar=False)
            self._log_norm()

        return loss
    

    def validation_step(self, batch, batch_idx):
        x = batch
        loss = self.forward(x)
        self.log('val/loss_epoch', loss['loss'], on_epoch=True)
        self.log('val_kl', loss['kl'], self.global_step)
        self.log('val_recon_loss', loss['recon_loss'], self.global_step)

        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


    def _log_norm(self):
        norm1 = sum(p.norm(1) for p in self.parameters())
        norm1_grad = sum(p.grad.norm(1) for p in self.parameters() if p.grad is not None)
        self.log('norm1_params', norm1)
        self.log('norm1_grad', norm1_grad)

class VAEAnomalyTabular(VAEAnomalyDetection):

    def make_encoder(self, input_size, latent_size):
        """
        Simple encoder for tabular data.
        If you want to feed image to a VAE make another encoder function with Conv2d instead of Linear layers.
        :param input_size: number of input variables
        :param latent_size: number of output variables i.e. the size of the latent space since it's the encoder of a VAE
        :return: The untrained encoder model
        """
        return nn.Sequential(
            nn.Linear(input_size, 500),
            nn.ReLU(),
            nn.Linear(500, 200),
            nn.ReLU(),
            nn.Linear(200, latent_size * 2)
            # times 2 because this is the concatenated vector of latent mean and variance
        )

    def make_decoder(self, latent_size, output_size):
        """
        Simple decoder for tabular data.
        :param latent_size: size of input latent space
        :param output_size: number of output parameters. Must have the same value of input_size
        :return: the untrained decoder
        """
        return nn.Sequential(
            nn.Linear(latent_size, 200),
            nn.ReLU(),
            nn.Linear(200, 500),
            nn.ReLU(),
            nn.Linear(500, output_size * 2)  # times 2 because this is the concatenated vector of reconstructed mean and variance
        )


================================================
FILE: model/VAE_tf1.py
================================================
# ========== Legacy code ===============


from math import ceil

import numpy as np
import tensorflow as tf
from scipy.stats import multivariate_normal


def tf_namespace(namespace):
    def wrapper(f):
        def wrapped_f(*args, **kwargs):
            with tf.name_scope(namespace):
                return f(*args, **kwargs)

        return wrapped_f

    return wrapper


class VAE:

    def __init__(self, input_shape, encode_sizes, latent_size, decode_sizes=None, mu_prior=None, sigma_prior=None,
                 lr=10e-4,  momentum=0.9, save_model=True):
        self.encode_sizes = encode_sizes
        self.latent_size = latent_size
        self.decode_sizes = decode_sizes or encode_sizes[::-1]
        self.mu_prior = mu_prior or np.zeros([latent_size], dtype='float32')
        self.sigma_prior = sigma_prior or np.ones([latent_size], 'float32')
        self.lr = lr
        self.momentum = momentum
        self.input_shape = input_shape
        self.save_model = save_model
        self._build_graph(input_shape, latent_size)

    def _build_graph(self, input_shape, latent_size):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self._create_placeholders(input_shape)
            self._create_encoder(self.X)
            self._create_latent_distribution(self.encoder, latent_size)
            self._create_decoder(self.z)
            self.loss = - self.elbo(self.X, self.decoder, self.mu, self.log_sigma_square, self.sigma_square,
                                    tf.constant(self.mu_prior), tf.constant(self.sigma_prior))
            self.opt = tf.train.AdamOptimizer(self.lr, self.momentum)
            self.opt_op = self.opt.minimize(self.loss)
            self.session = tf.InteractiveSession(graph=self.graph)
        writer = tf.summary.FileWriter(logdir='logdir', graph=self.graph)
        writer.flush()

    @property
    def k_init(self):
        return {'kernel_initializer': tf.glorot_uniform_initializer()}

    def elbo(self, X_true, X_pred, mu, log_sigma, sigma, mu_prior, sigma_prior):
        epsilon = tf.constant(0.000001)
        self.mae = tf.losses.absolute_difference(X_true, X_pred, reduction=tf.losses.Reduction.NONE)
        self.mae_sum = tf.reduce_sum(self.mae, axis=1)
        log_sigma_prior = tf.log(sigma_prior + epsilon)
        mu_diff = mu - mu_prior
        self.kl = log_sigma_prior - log_sigma - 1 + (sigma + tf.multiply(mu_diff, mu_diff)) / sigma_prior
        self.kl_sum = tf.reduce_sum(self.kl, axis=1)
        return tf.reduce_mean(- self.mae_sum - self.kl_sum)

    @tf_namespace('placeholders')
    def _create_placeholders(self, input_shape):
        self.X = tf.placeholder(tf.float32, shape=[None, *input_shape], name='X')

    @tf_namespace('encoder')
    def _create_encoder(self, X):
        self.encode_layers = []
        self.encoder = X
        for i, lsize in enumerate(self.encode_sizes):
            self.encoder = tf.layers.dense(self.encoder, lsize, **self.k_init,
                                           activation=tf.nn.relu, name=f'encoder_{i + 1}')
            self.encode_layers.append(self.encoder)
            setattr(self, f'encoder_{i + 1}', self.encoder)

    @tf_namespace('latent')
    def _create_latent_distribution(self, encoder, latent_dim):
        self.mu = tf.layers.dense(encoder, latent_dim, **self.k_init, name='mu')
        self.log_sigma_square = tf.layers.dense(encoder, latent_dim,
                                                **self.k_init, name='log_sigma_square')
        self.sigma_square = tf.exp(self.log_sigma_square, 'sigma_square')
        self.z = tf.add(self.mu, self.sigma_square * tf.random.normal(tf.shape(self.sigma_square)), 'z')

    @tf_namespace('decoder')
    def _create_decoder(self, z):
        self.decoder = z
        self.decode_layers = []
        for i, lsize in enumerate(self.decode_sizes):
            self.decoder = tf.layers.dense(self.decoder, lsize, **self.k_init,
                                           activation=tf.nn.relu, name=f'decoder_{i + 1}')
            setattr(self, f'decoder_{i + 1}', self.decoder)
            self.decode_layers.append(self.decoder)
            if i == len(self.decode_sizes) - 1:
                self.mu_post = tf.layers.dense(self.decoder, self.input_shape[0], name='mu_posterior')
                self.log_sigma_post = tf.layers.dense(self.decoder, self.input_shape[0])
                self.sigma_post = tf.exp(self.log_sigma_post, 'sigma_square_posterior')
                self.decoder = tf.add(self.mu_post,
                                      self.sigma_post * tf.random.normal((self.input_shape[0],), name='eps_post'),
                                      name='decoder_output')
                setattr(self, f'decoder_{i + 2}', self.decoder)
                self.decode_layers.append(self.decoder)
        return self.decoder

    @property
    def layers(self):
        return [(f'encoder_{i}', getattr(self, f'encoder_{i}')) for i in range(1, len(self.encode_layers) + 1)] + \
               [('mu', self.mu), ('sigma', self.log_sigma_square), ('z', self.z)] + \
               [(f'decoder_{i}', getattr(self, f'decoder_{i}')) for i in range(1, len(self.decode_layers) + 1)]

    def fit(self, X, epochs, batch_size, print_every=50, save_every_epochs=5, verbose=True):
        n_batch = ceil(X.shape[0] / batch_size)
        if self.save_model:
            saver = tf.train.Saver()
        self.session.run(tf.global_variables_initializer())
        for epoch in range(1, epochs + 1):
            np.random.shuffle(X)
            acc_loss = 0
            counter = 0
            for i in range(n_batch):
                slice_batch = slice(i * batch_size, (i + 1) * batch_size) if i != n_batch - 1 else slice(
                    i * batch_size,
                    None)
                X_batch = X[slice_batch, :]
                batch_loss, _ = self.session.run([self.loss, self.opt_op], {self.X: X_batch})
                acc_loss += batch_loss
                if verbose and counter % print_every == 0:
                    print(f" Epoch {epoch} - batch {i} - neg_ELBO = {batch_loss}")
                counter += 1
            if verbose:
                print(f'\nEpoch {epoch} - Avg loss = {acc_loss / n_batch}')
                print('\n' + ('-' * 70))
            if self.save_model and (epoch+1) % save_every_epochs == 0:
                saver.save(self.session, "ckpts/ad_vae.ckpt")

    def generate(self, n=1, mu_prior=None, sigma_prior=None):
        """
        Generate new examples sampling from the latent distribution
        :param n: number of examples to generate
        :param mu_prior:
        :param sigma_prior:
        :return: a matrix of size [n, p] where p is the number of variables of X_train
        """
        if mu_prior is None:
            mu_prior = self.mu_prior
        if sigma_prior is None:
            sigma_prior = self.sigma_prior
        z = np.random.multivariate_normal(mu_prior, np.diag(sigma_prior), [n])
        return self.session.run(self.decoder, feed_dict={self.z: z})

    def reconstruct(self, X):
        return self.session.run(self.decoder, feed_dict={self.X: X})

    def reconstructed_probability(self, X, L=100):
        reconstructed_prob = np.zeros((X.shape[0],), dtype='float32')
        mu_hat, sigma_hat = self.session.run([self.mu_post, self.sigma_post], {self.X: X})
        for l in range(L):
            mu_hat = mu_hat.reshape(X.shape)
            sigma_hat = sigma_hat.reshape(X.shape) + 0.00001
            for i in range(X.shape[0]):
                p_l = multivariate_normal.pdf(X[i, :], mu_hat[i, :], np.diag(sigma_hat[i, :]))
                reconstructed_prob[i] += p_l
        reconstructed_prob /= L
        return reconstructed_prob

    def is_outlier(self, X, L=100, alpha=0.05):
        p_hat = self.reconstructed_probability(X, L)
        return p_hat < alpha

    def open(self):
        if not hasattr(self, 'session') or self.session is None:
            if self.graph is None:
                self._build_graph(self.input_shape, self.latent_size)
            else:
                self.session = tf.InteractiveSession(graph=self.graph)

    def close(self):
        if hasattr(VAE, 'session') and VAE.session is not None:
            VAE.session.close()
            VAE.session = None

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __delete__(self, instance):
        self.close()

    def __setattr__(self, key, value):
        if key == 'session':
            if hasattr(self, 'session') and self.session is not None:
                self.close()
            VAE.session = value
        else:
            self.__dict__[key] = value

    def __delattr__(self, item):
        if item == 'session':
            self.close()
            del VAE.__dict__['session']
        else:
            del self.__dict__[item]

    def __enter__(self):
        self.open()

================================================
FILE: model/__init__.py
================================================
from .VAE import VAEAnomalyDetection, VAEAnomalyTabular

================================================
FILE: model/encoder_decoder.py
================================================
"""
This module contains simple encoder and decoder for tabular data.
For your own data you need to create your own encoder and decoder.
However the input and output of your encoder and decoder must be the same of the ones in this module.
"""

from torch import nn

def tabular_encoder(input_size: int, latent_size: int):
    """
    Simple encoder for tabular data.
    If you want to feed image to a VAE make another encoder function with Conv2d instead of Linear layers.
    
    Parameters
    ----------
    input_size : int
        number of input variables. In case of tabular data it's the number of columns.
    latent_size : int
        number of output variables i.e. the size of the latent space since it's the encoder of a VAE

    Returns
    -------
    The untrained encoder model
    
    """
    return nn.Sequential(
        nn.Linear(input_size, 500),
        nn.ReLU(),
        nn.Linear(500, 200),
        nn.ReLU(),
        nn.Linear(200, latent_size * 2)  # times 2 because this is the concatenated vector of latent mean and variance
    )


def tabular_decoder(latent_size: int, output_size: int):
    """
    Simple decoder for tabular data.

    Parameters
    ----------
    latent_size : int
        size of input latent space
    output_size : int
        number of output parameters. Must have the same value of input_size of the encoder

    Returns
    -------
    The untrained decoder
    """
    return nn.Sequential(
        nn.Linear(latent_size, 200),
        nn.ReLU(),
        nn.Linear(200, 500),
        nn.ReLU(),
        nn.Linear(500, output_size * 2)
        # times 2 because this is the concatenated vector of reconstructed mean and variance
    )


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"


[project]
name = "vae_anomaly_detection"
version = "2.0.1"
requires-python = ">3.6,<3.12"
description = "Pytorch/TF1 implementation of Variational AutoEncoder for anomaly detection following the paper \"Variational Autoencoder based Anomaly Detection using Reconstruction Probability by Jinwon An, Sungzoon Cho\""
authors = [{name="Michele De Vita", email="mik3dev@gmail.com"}]

classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Software Development :: Libraries :: Python Modules"
    ]
keywords = ["vae", "anomaly detection", "deep learning", "pytorch"]
license = {text = "MIT"}
readme = "readme.md"

dependencies = [
"path>=15.0",
"torch>=1.8",
"pytorch-lightning>=1.9",
"PyYAML>=5.0",
"tqdm>=4.0",
"tensorboard>=0.20",
"numpy>= 1.18",
]

[project.urls]
homepage = "https://github.com/Michedev/VAE_anomaly_detection"
repository = "https://github.com/Michedev/VAE_anomaly_detection"

[project.optional-dependencies]
dev = [
    "pytest",
]

[tool.hatch.envs.default]
python = "3.10"
dependencies = [
    "torch>=1.8",
    "pytorch-lightning",
    "path",
    "tensorboard",
    "numpy",
    "torchvision",
]

[tool.hatch.envs.default.scripts]
train = "python train.py -i 100 -l 32 {args:train}"


[tool.hatch.envs.cpu]
python = "3.10"
dependencies = [
    "torch>=1.8",
    "pytorch-lightning",
    "path",
    "tensorboard",
    "numpy",
    "torchvision",
]

[tool.hatch.envs.cpu.env-vars]
PIP_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu"


[tool.hatch.envs.test]
python = "python3"
dependencies = [
    "torch>=1.8",
    "pytorch-lightning",
    "path",
    "tensorboard",
    "numpy",
    "torchvision",
    "pytest",
]

[tool.hatch.envs.test.overrides]
matrix.foo.set-python = ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11"]


[tool.hatch.envs.test.env-vars]
PIP_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu"


[tool.hatch.build]
include = ["vae_anomaly_detection"]

================================================
FILE: readme.md
================================================
# Variational autoencoder for anomaly detection

![PyPI](https://img.shields.io/pypi/v/vae-anomaly-detection?style=flat-square)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/vae-anomaly-detection?style=flat-square)
![PyPI - License](https://img.shields.io/pypi/l/vae-anomaly-detection?style=flat-square)
![PyPI - Downloads](https://img.shields.io/pypi/dm/vae-anomaly-detection?style=flat-square)

Pytorch/TF1 implementation of Variational AutoEncoder for anomaly detection following the paper
 [Variational Autoencoder based Anomaly Detection using Reconstruction Probability by Jinwon An, Sungzoon Cho](https://www.semanticscholar.org/paper/Variational-Autoencoder-based-Anomaly-Detection-An-Cho/061146b1d7938d7a8dae70e3531a00fceb3c78e8)
 <br>

## How to install

#### Python package way
 _pip_ package containing the model and training_step only 
   
    pip install vae-anomaly-detection


#### Hack this repository


   a. Clone the repo

    git clone git@github.com:Michedev/VAE_anomaly_detection.git

   b. Install hatch

    pip install hatch

   c. Make the environment with torch gpu support

    hatch env create
      
or with cpu support

    hatch env create cpu

   d. Run the train

    hatch run train

or in cpu
          
    hatch run cpu:train

   To know all the train parameters run `hatch run train --help`


This version contains the model and the training procedure

## How To Train your Model

- Define your dataset into dataset.py and overwrite the line `train_set = rand_dataset()  # set here your dataset` in `train.py`
- Subclass VAEAnomalyDetection and define the methods `make_encoder` and `make_decoder`. The output of `make_encoder` should be a flat vector while the output of `make_decoder should have the same shape of the input.
## Make your model

Subclass ```VAEAnomalyDetection``` and define your encoder and decoder like in ```VaeAnomalyTabular```

```python
class VAEAnomalyTabular(VAEAnomalyDetection):

    def make_encoder(self, input_size, latent_size):
        """
        Simple encoder for tabular data.
        If you want to feed image to a VAE make another encoder function with Conv2d instead of Linear layers.
        :param input_size: number of input variables
        :param latent_size: number of output variables i.e. the size of the latent space since it's the encoder of a VAE
        :return: The untrained encoder model
        """
        return nn.Sequential(
            nn.Linear(input_size, 500),
            nn.ReLU(),
            nn.Linear(500, 200),
            nn.ReLU(),
            nn.Linear(200, latent_size * 2)
            # times 2 because this is the concatenated vector of latent mean and variance
        )

    def make_decoder(self, latent_size, output_size):
        """
        Simple decoder for tabular data.
        :param latent_size: size of input latent space
        :param output_size: number of output parameters. Must have the same value of input_size
        :return: the untrained decoder
        """
        return nn.Sequential(
            nn.Linear(latent_size, 200),
            nn.ReLU(),
            nn.Linear(200, 500),
            nn.ReLU(),
            nn.Linear(500, output_size * 2)  # times 2 because this is the concatenated vector of reconstructed mean and variance
        )
```

## How to make predictions:
Once the model is trained (suppose for simplicity that it is under _saved_models/{train-datetime}/_ ) just load and predict with this code snippet:
```python
import torch

#load X_test
model = VaeAnomalyTabular.load_checkpoint('saved_models/2022-01-06_15-12-23/last.ckpt')
# load saved parameters from a run
outliers = model.is_anomaly(X_test)
```


## train.py help

        usage: train.py [-h] --input-size INPUT_SIZE --latent-size LATENT_SIZE
                        [--num-resamples NUM_RESAMPLES] [--epochs EPOCHS] [--batch-size BATCH_SIZE]
                        [--device {cpu,gpu,tpu}] [--lr LR] [--no-progress-bar]
                        [--steps-log-loss STEPS_LOG_LOSS]
                        [--steps-log-norm-params STEPS_LOG_NORM_PARAMS]

        options:
        -h, --help            show this help message and exit
        --input-size INPUT_SIZE, -i INPUT_SIZE
                                Number of input features. In 1D case it is the vector length, in 2D
                                case it is the number of channels
        --latent-size LATENT_SIZE, -l LATENT_SIZE
                                Size of the latent space
        --num-resamples NUM_RESAMPLES, -L NUM_RESAMPLES
                                Number of resamples in the latent distribution during training
        --epochs EPOCHS, -e EPOCHS
                                Number of epochs to train for
        --batch-size BATCH_SIZE, -b BATCH_SIZE
        --device {cpu,gpu,tpu}, -d {cpu,gpu,tpu}, --accelerator {cpu,gpu,tpu}
                                Device to use for training. Can be cpu, gpu or tpu
        --lr LR               Learning rate
        --no-progress-bar
        --steps-log-loss STEPS_LOG_LOSS
                                Number of steps between each loss logging
        --steps-log-norm-params STEPS_LOG_NORM_PARAMS
                                Number of steps between each model parameters logging


================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/test_pytorch_model.py
================================================
import torch
from model import VAEAnomalyTabular


def test_pytorch_anomaly_detection():
    batch_size = 32
    input_size = 100
    latent_size = 32
    model = VAEAnomalyTabular(input_size, latent_size, L=2)
    batch = torch.rand(batch_size, input_size)
    batch_anomaly = model.is_anomaly(batch, alpha=0.05)
    assert batch_anomaly.shape == (batch_size,)
    assert batch_anomaly.dtype == torch.bool


def test_pytorch_prediction():
    batch_size = 32
    input_size = 100
    latent_size = 32
    model = VAEAnomalyTabular(input_size, latent_size, L=2)
    batch = torch.rand(batch_size, input_size)
    reconstructed_probability = model.reconstructed_probability(batch)
    assert reconstructed_probability.shape == (batch_size,)
    assert reconstructed_probability.dtype == torch.float
    assert 1.0 >= reconstructed_probability.max().item() and \
           reconstructed_probability.min().item() >= 0.0
    

def test_training_step():
    batch_size = 32
    input_size = 100
    latent_size = 32
    model = VAEAnomalyTabular(input_size, latent_size, L=2)
    batch = torch.rand(batch_size, input_size)
    loss_dict = model.training_step(batch, batch_idx=0)
    loss = loss_dict['loss']
    assert loss.numel() == 1
    assert loss.dtype == torch.float    


================================================
FILE: train.py
================================================
import argparse
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
import torch
import yaml
from path import Path
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

from model.VAE import VAEAnomalyTabular
from dataset import rand_dataset

ROOT = Path(__file__).parent
SAVED_MODELS = ROOT / 'saved_models'


def make_folder_run() -> Path:
    """
    Get the folder where to store the experiment. 
    The folder is named with the current date and time.
    
    Returns:
        Path: the path to the folder where to store the experiment
    """
    checkpoint_folder = SAVED_MODELS / datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    checkpoint_folder.makedirs_p()
    return checkpoint_folder


def get_args() -> argparse.Namespace:
    """
    Parse command line arguments
    
    Returns:
        argparse.Namespace: the parsed arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-size', '-i', type=int, required=True, dest='input_size', help='Number of input features. In 1D case it is the vector length, in 2D case it is the number of channels')
    parser.add_argument('--latent-size', '-l', type=int, required=True, dest='latent_size', help='Size of the latent space')
    parser.add_argument('--num-resamples', '-L', type=int, dest='num_resamples', default=10,
                        help='Number of resamples in the latent distribution during training')
    parser.add_argument('--epochs', '-e', type=int, dest='epochs', default=100, help='Number of epochs to train for')
    parser.add_argument('--batch-size', '-b', type=int, dest='batch_size', default=32)
    parser.add_argument('--device', '-d', '--accelerator', type=str, dest='device', default='gpu', help='Device to use for training. Can be cpu, gpu or tpu', choices=['cpu', 'gpu', 'tpu'])
    parser.add_argument('--lr', type=float, dest='lr', default=1e-3, help='Learning rate')
    parser.add_argument('--no-progress-bar', action='store_true', dest='no_progress_bar')
    parser.add_argument('--steps-log-loss', type=int, dest='steps_log_loss', default=1_000, help='Number of steps between each loss logging')
    parser.add_argument('--steps-log-norm-params', type=int, 
                        dest='steps_log_norm_params', default=1_000, help='Number of steps between each model parameters logging')

    return parser.parse_args()


def main():
    """
    Main function to train the VAE model
    """
    args = get_args()
    print(args)
    experiment_folder = make_folder_run()

    # copy model folder into experiment folder
    ROOT.joinpath('model').copytree(experiment_folder / 'model')

    with open(experiment_folder / 'config.yaml', 'w') as f:
        yaml.dump(args, f)

    model = VAEAnomalyTabular(args.input_size, args.latent_size, args.num_resamples, lr=args.lr)

    train_set = rand_dataset()  # set here your dataset
    train_dloader = DataLoader(train_set, args.batch_size)

    val_dataset = rand_dataset()  # set here your dataset
    val_dloader = DataLoader(val_dataset, args.batch_size)

    checkpoint = ModelCheckpoint(
        filepath=experiment_folder / '{epoch:02d}-{val_loss:.2f}',
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min',
        prefix='',
        save_last=True,
    )

    trainer = Trainer(callbacks=[checkpoint],)
    trainer.fit(model, train_dloader, val_dloader)


if __name__ == '__main__':
    main()