[
  {
    "path": ".github/FUNDING.yml",
    "content": "github: [vc1492a]\n"
  },
  {
    "path": ".github/workflows/tests.yml",
    "content": "# This workflow will install Python dependencies, run tests and lint with a variety of Python versions\n# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python\n\nname: tests\n\non:\n  push:\n    branches: [ \"main\", \"dev\" ]\n  pull_request:\n    branches: [ \"main\", \"dev\" ]\n\njobs:\n  test:\n\n    runs-on: ubuntu-latest\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.8\", \"3.9\", \"3.10\", \"3.11\", \"3.12\", \"3.13\"]\n\n    steps:\n    - uses: actions/checkout@v4\n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v3\n      with:\n        python-version: ${{ matrix.python-version }}\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        python -m pip install flake8 pytest\n        pip install -r requirements.txt\n        pip install -r requirements_ci.txt\n    - name: Lint with flake8\n      run: |\n        # stop the build if there are Python syntax errors or undefined names\n        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics\n        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide\n        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics\n    - name: Test with pytest\n      run: |\n        pytest --cov=PyNomaly\n"
  },
  {
    "path": ".gitignore",
    "content": "*.DS_STORE\n.idea/\n__pycache__/\n*.csv\nnasaValve\nrel_research\nPyNomaly/loop_dev.py\n/PyNomaly.egg-info/\n*.pyc\n*.coverage.*\n.coveragerc\n.pypirc\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.\n#   This is especially recommended for binary packages to ensure 
reproducibility, and is more\n#   commonly ignored for libraries.\n#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n\n# pdm\n#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.\n#pdm.lock\n#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it\n#   in version control.\n#   https://pdm.fming.dev/#use-with-ide\n.pdm.toml\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n#.idea/\n\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright 2017 Valentino Constantinou.\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License."
  },
  {
    "path": "PyNomaly/__init__.py",
    "content": "# Authors: Valentino Constantinou <vc@valentino.io>\n# License: Apache 2.0\n\nfrom PyNomaly.loop import (\n    LocalOutlierProbability,\n    PyNomalyError,\n    ValidationError,\n    ClusterSizeError,\n    MissingValuesError,\n)\n\n__all__ = [\n    \"LocalOutlierProbability\",\n    \"PyNomalyError\",\n    \"ValidationError\",\n    \"ClusterSizeError\",\n    \"MissingValuesError\",\n]\n"
  },
  {
    "path": "PyNomaly/loop.py",
    "content": "from math import erf, sqrt\nimport numpy as np\nfrom python_utils.terminal import get_terminal_size\nimport sys\nfrom typing import Tuple, Union\nimport warnings\n\ntry:\n    import numba\nexcept ImportError:\n    pass\n\n__author__ = \"Valentino Constantinou\"\n__version__ = \"0.3.5\"\n__license__ = \"Apache License, Version 2.0\"\n\n\n# Custom Exceptions\nclass PyNomalyError(Exception):\n    \"\"\"Base exception for PyNomaly.\"\"\"\n    pass\n\n\nclass ValidationError(PyNomalyError):\n    \"\"\"Raised when input validation fails.\"\"\"\n    pass\n\n\nclass ClusterSizeError(ValidationError):\n    \"\"\"Raised when cluster size is smaller than n_neighbors.\"\"\"\n    pass\n\n\nclass MissingValuesError(ValidationError):\n    \"\"\"Raised when data contains missing values.\"\"\"\n    pass\n\n\nclass Utils:\n    @staticmethod\n    def emit_progress_bar(progress: str, index: int, total: int) -> str:\n        \"\"\"\n        A progress bar that is continuously updated in Python's standard\n        out.\n        :param progress: a string printed to stdout that is updated and later\n        returned.\n        :param index: the current index of the iteration within the tracked\n        process.\n        :param total: the total length of the tracked process.\n        :return: progress string.\n        \"\"\"\n\n        w, h = get_terminal_size()\n        sys.stdout.write(\"\\r\")\n        if total < w:\n            block_size = int(w / total)\n        else:\n            block_size = int(total / w)\n        if index % block_size == 0:\n            progress += \"=\"\n        percent = index / total\n        sys.stdout.write(\"[ %s ] %.2f%%\" % (progress, percent * 100))\n        sys.stdout.flush()\n        return progress\n\n\nclass LocalOutlierProbability(object):\n    \"\"\"\n    :param data: a Pandas DataFrame or Numpy array of float data\n    :param extent: an integer value [1, 2, 3] that controls the statistical \n    extent, e.g. 
lambda times the standard deviation from the mean (optional, \n    default 3)\n    :param n_neighbors: the total number of neighbors to consider w.r.t. each \n    sample (optional, default 10)\n    :param cluster_labels: a numpy array of cluster assignments w.r.t. each \n    sample (optional, default None)\n    :return:\n\n    Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP: \n    Local Outlier Probabilities.\n    ----------\n\n    References\n    ----------\n    .. [1] Breunig M., Kriegel H.-P., Ng R., Sander, J. LOF: Identifying \n           Density-based Local Outliers. ACM SIGMOD\n           International Conference on Management of Data (2000).\n    .. [2] Kriegel H.-P., Kröger P., Schubert E., Zimek A. LoOP: Local Outlier \n           Probabilities. 18th ACM conference on \n           Information and knowledge management, CIKM (2009).\n    .. [3] Goldstein M., Uchida S. A Comparative Evaluation of Unsupervised \n           Anomaly Detection Algorithms for Multivariate Data. PLoS ONE 11(4):\n           e0152173 (2016).\n    .. [4] Hamlet C., Straub J., Russell M., Kerlin S. An incremental and \n           approximate local outlier probability algorithm for intrusion \n           detection and its evaluation. Journal of Cyber Security Technology \n           (2016). 
\n    \"\"\"\n\n    \"\"\"\n    Validation methods.\n    These methods validate inputs and raise exceptions or warnings as appropriate.\n    \"\"\"\n\n    @staticmethod\n    def _convert_to_array(obj: Union[\"pd.DataFrame\", np.ndarray]) -> np.ndarray:\n        \"\"\"\n        Converts the input data to a numpy array if it is a Pandas DataFrame\n        or validates it is already a numpy array.\n        :param obj: user-provided input data.\n        :return: a vector of values to be used in calculating the local\n        outlier probability.\n        \"\"\"\n        if obj.__class__.__name__ == \"DataFrame\":\n            points_vector = obj.values\n            return points_vector\n        elif obj.__class__.__name__ == \"ndarray\":\n            points_vector = obj\n            return points_vector\n        else:\n            warnings.warn(\n                \"Provided data or distance matrix must be in ndarray \"\n                \"or DataFrame.\",\n                UserWarning,\n            )\n            if isinstance(obj, list):\n                points_vector = np.array(obj)\n                return points_vector\n            points_vector = np.array([obj])\n            return points_vector\n\n    def _validate_inputs(self):\n        \"\"\"\n        Validates the inputs provided during initialization to ensure\n        that the needed objects are provided.\n        :return: a tuple of (data, distance_matrix, neighbor_matrix) or\n        raises a warning for invalid inputs.\n        \"\"\"\n        if all(v is None for v in [self.data, self.distance_matrix]):\n            warnings.warn(\n                \"Data or a distance matrix must be provided.\", UserWarning\n            )\n            return False\n        elif all(v is not None for v in [self.data, self.distance_matrix]):\n            warnings.warn(\n                \"Only one of the following may be provided: data or a \"\n                \"distance matrix (not both).\",\n                UserWarning,\n     
       )\n            return False\n        if self.data is not None:\n            points_vector = self._convert_to_array(self.data)\n            return points_vector, self.distance_matrix, self.neighbor_matrix\n        if all(\n            matrix is not None\n            for matrix in [self.neighbor_matrix, self.distance_matrix]\n        ):\n            dist_vector = self._convert_to_array(self.distance_matrix)\n            neigh_vector = self._convert_to_array(self.neighbor_matrix)\n        else:\n            warnings.warn(\n                \"A neighbor index matrix and distance matrix must both be \"\n                \"provided when not using raw input data.\",\n                UserWarning,\n            )\n            return False\n        if self.distance_matrix.shape != self.neighbor_matrix.shape:\n            warnings.warn(\n                \"The shape of the distance and neighbor \"\n                \"index matrices must match.\",\n                UserWarning,\n            )\n            return False\n        elif (self.distance_matrix.shape[1] != self.n_neighbors) or (\n            self.neighbor_matrix.shape[1] != self.n_neighbors\n        ):\n            warnings.warn(\n                \"The shape of the distance or \"\n                \"neighbor index matrix does not \"\n                \"match the number of neighbors \"\n                \"specified.\",\n                UserWarning,\n            )\n            return False\n        return self.data, dist_vector, neigh_vector\n\n    def _check_cluster_size(self) -> None:\n        \"\"\"\n        Validates the cluster labels to ensure that the smallest cluster\n        size (number of observations in the cluster) is larger than the\n        specified number of neighbors.\n        :raises ClusterSizeError: if any cluster is too small.\n        \"\"\"\n        c_labels = self._cluster_labels()\n        for cluster_id in set(c_labels):\n            c_size = np.where(c_labels == cluster_id)[0].shape[0]\n        
    if c_size <= self.n_neighbors:\n                raise ClusterSizeError(\n                    \"Number of neighbors specified larger than smallest \"\n                    \"cluster. Specify a number of neighbors smaller than \"\n                    \"the smallest cluster size (observations in smallest \"\n                    \"cluster minus one).\"\n                )\n\n    def _check_n_neighbors(self) -> bool:\n        \"\"\"\n        Validates the specified number of neighbors to ensure that it is\n        greater than 0 and that the specified value is less than the total\n        number of observations.\n        :return: a boolean indicating whether validation has passed without\n        adjustment.\n        \"\"\"\n        if not self.n_neighbors > 0:\n            self.n_neighbors = 10\n            warnings.warn(\n                \"n_neighbors must be greater than 0.\"\n                \" Fit with \" + str(self.n_neighbors) + \" instead.\",\n                UserWarning,\n            )\n            return False\n        elif self.n_neighbors >= self._n_observations():\n            self.n_neighbors = self._n_observations() - 1\n            warnings.warn(\n                \"n_neighbors must be less than the number of observations.\"\n                \" Fit with \" + str(self.n_neighbors) + \" instead.\",\n                UserWarning,\n            )\n        return True\n\n    def _check_extent(self) -> bool:\n        \"\"\"\n        Validates the specified extent parameter to ensure it is either 1,\n        2, or 3.\n        :return: a boolean indicating whether validation has passed.\n        \"\"\"\n        if self.extent not in [1, 2, 3]:\n            warnings.warn(\n                \"extent parameter (lambda) must be 1, 2, or 3.\", UserWarning\n            )\n            return False\n        return True\n\n    def _check_missing_values(self) -> None:\n        \"\"\"\n        Validates the provided data to ensure that it contains no\n        missing 
values.\n        :raises MissingValuesError: if data contains NaN values.\n        \"\"\"\n        if np.any(np.isnan(self.data)):\n            raise MissingValuesError(\n                \"Method does not support missing values in input data.\"\n            )\n\n    def _check_is_fit(self) -> bool:\n        \"\"\"\n        Checks that the model was fit prior to calling the stream() method.\n        :return: a boolean indicating whether the model has been fit.\n        \"\"\"\n        if self.is_fit is False:\n            warnings.warn(\n                \"Must fit on historical data by calling fit() prior to \"\n                \"calling stream(x).\",\n                UserWarning,\n            )\n            return False\n        return True\n\n    def _check_no_cluster_labels(self) -> bool:\n        \"\"\"\n        Checks to see if cluster labels are attempting to be used in\n        stream() and, if so, returns False. As PyNomaly does not accept\n        clustering algorithms as input, the stream approach does not\n        support clustering.\n        :return: a boolean indicating whether single cluster (no labels).\n        \"\"\"\n        if len(set(self._cluster_labels())) > 1:\n            warnings.warn(\n                \"Stream approach does not support clustered data. 
\"\n                \"Automatically refit using single cluster of points.\",\n                UserWarning,\n            )\n            return False\n        return True\n\n    \"\"\"\n    Decorators.\n    \"\"\"\n\n    def accepts(*types):\n        \"\"\"\n        A decorator that facilitates a form of type checking for the inputs\n        which can be used in Python 3.4-3.7 in lieu of Python 3.5+'s type\n        hints.\n        :param types: the input types of the objects being passed as arguments\n        in __init__.\n        :return: a decorator.\n        \"\"\"\n\n        def decorator(f):\n            assert len(types) == f.__code__.co_argcount\n\n            def new_f(*args, **kwds):\n                for a, t in zip(args, types):\n                    if type(a).__name__ == \"DataFrame\":\n                        a = np.array(a)\n                    if isinstance(a, t) is False:\n                        warnings.warn(\n                            \"Argument %r is not of type %s\" % (a, t), UserWarning\n                        )\n                opt_types = {\n                    \"distance_matrix\": {\"type\": types[2]},\n                    \"neighbor_matrix\": {\"type\": types[3]},\n                    \"extent\": {\"type\": types[4]},\n                    \"n_neighbors\": {\"type\": types[5]},\n                    \"cluster_labels\": {\"type\": types[6]},\n                    \"use_numba\": {\"type\": types[7]},\n                    \"progress_bar\": {\"type\": types[8]},\n                }\n                for x in kwds:\n                    opt_types[x][\"value\"] = kwds[x]\n                for k in opt_types:\n                    try:\n                        if (\n                            isinstance(opt_types[k][\"value\"], opt_types[k][\"type\"])\n                            is False\n                        ):\n                            warnings.warn(\n                                \"Argument %r is not of type %s.\"\n                           
     % (k, opt_types[k][\"type\"]),\n                                UserWarning,\n                            )\n                    except KeyError:\n                        pass\n                return f(*args, **kwds)\n\n            new_f.__name__ = f.__name__\n            return new_f\n\n        return decorator\n\n    @accepts(\n        object,\n        np.ndarray,\n        np.ndarray,\n        np.ndarray,\n        (int, np.integer),\n        (int, np.integer),\n        list,\n        bool,\n        bool,\n    )\n    def __init__(\n        self,\n        data=None,\n        distance_matrix=None,\n        neighbor_matrix=None,\n        extent=3,\n        n_neighbors=10,\n        cluster_labels=None,\n        use_numba=False,\n        progress_bar=False,\n    ) -> None:\n        self.data = data\n        self.distance_matrix = distance_matrix\n        self.neighbor_matrix = neighbor_matrix\n        self.extent = extent\n        self.n_neighbors = n_neighbors\n        self.cluster_labels = cluster_labels\n        self.use_numba = use_numba\n        self.points_vector = None\n        self.prob_distances = None\n        self.prob_distances_ev = None\n        self.norm_prob_local_outlier_factor = None\n        self.local_outlier_probabilities = None\n        self._objects = {}\n        self.progress_bar = progress_bar\n        self.is_fit = False\n\n        if self.use_numba is True and \"numba\" not in sys.modules:\n            self.use_numba = False\n            warnings.warn(\n                \"Numba is not available, falling back to pure python mode.\", UserWarning\n            )\n\n        self._validate_inputs()\n        self._check_extent()\n\n    \"\"\"\n    Private methods.\n    \"\"\"\n\n    @staticmethod\n    def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:\n        \"\"\"\n        Calculates the standard distance of an observation.\n        :param cardinality: the cardinality of the input observation.\n        :param 
sum_squared_distance: the sum squared distance between all\n        neighbors of the input observation.\n        :return: the standard distance.\n        \"\"\"\n        division_result = sum_squared_distance / cardinality\n        st_dist = sqrt(division_result)\n        return st_dist\n\n    @staticmethod\n    def _prob_distance(extent: int, standard_distance: float) -> float:\n        \"\"\"\n        Calculates the probabilistic distance of an observation.\n        :param extent: the extent value specified during initialization.\n        :param standard_distance: the standard distance of the input\n        observation.\n        :return: the probabilistic distance.\n        \"\"\"\n        return extent * standard_distance\n\n    @staticmethod\n    def _prob_outlier_factor(\n        probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray\n    ) -> np.ndarray:\n        \"\"\"\n        Calculates the probabilistic outlier factor of an observation.\n        :param probabilistic_distance: the probabilistic distance of the\n        input observation.\n        :param ev_prob_dist: the expected value of the probabilistic\n        distance of the input observation.\n        :return: the probabilistic outlier factor.\n        \"\"\"\n        if np.all(probabilistic_distance == ev_prob_dist):\n            return np.zeros(probabilistic_distance.shape)\n        else:\n            ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8\n            result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0\n            return result\n\n    @staticmethod\n    def _norm_prob_outlier_factor(\n        extent: float, ev_probabilistic_outlier_factor: list\n    ) -> list:\n        \"\"\"\n        Calculates the normalized probabilistic outlier factor of an\n        observation.\n        :param extent: the extent value specified during initialization.\n        :param ev_probabilistic_outlier_factor: the expected probabilistic\n        outlier factor of the input observation.\n        :return: the normalized probabilistic outlier factor.\n        \"\"\"\n        npofs = 
[]\n        for i in ev_probabilistic_outlier_factor:\n            npofs.append(extent * sqrt(i))\n        return npofs\n\n    @staticmethod\n    def _local_outlier_probability(\n        plof_val: np.ndarray, nplof_val: np.ndarray\n    ) -> np.ndarray:\n        \"\"\"\n        Calculates the local outlier probability of an observation.\n        :param plof_val: the probabilistic outlier factor of the input\n        observation.\n        :param nplof_val: the normalized probabilistic outlier factor of the\n        input observation.\n        :return: the local outlier probability.\n        \"\"\"\n        erf_vec = np.vectorize(erf)\n        if np.all(plof_val == nplof_val):\n            return np.zeros(plof_val.shape)\n        else:\n            return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))\n\n    def _n_observations(self) -> int:\n        \"\"\"\n        Calculates the number of observations in the data.\n        :return: the number of observations in the input data.\n        \"\"\"\n        if self.data is not None:\n            return len(self.data)\n        return len(self.distance_matrix)\n\n    def _store(self) -> np.ndarray:\n        \"\"\"\n        Initializes the storage matrix that includes the input value,\n        cluster labels, local outlier probability, etc. 
for the input data.\n        :return: an empty numpy array of shape [n_observations, 3].\n        \"\"\"\n        return np.empty([self._n_observations(), 3], dtype=object)\n\n    def _cluster_labels(self) -> np.ndarray:\n        \"\"\"\n        Returns a numpy array of cluster labels that corresponds to the\n        input labels or that is an array of all 0 values to indicate all\n        points belong to the same cluster.\n        :return: a numpy array of cluster labels.\n        \"\"\"\n        if self.cluster_labels is None:\n            if self.data is not None:\n                return np.array([0] * len(self.data))\n            return np.array([0] * len(self.distance_matrix))\n        return np.array(self.cluster_labels)\n\n    @staticmethod\n    def _euclidean(vector1: np.ndarray, vector2: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculates the euclidean distance between two observations in the\n        input data.\n        :param vector1: a numpy array corresponding to observation 1.\n        :param vector2: a numpy array corresponding to observation 2.\n        :return: the euclidean distance between the two observations.\n        \"\"\"\n        diff = vector1 - vector2\n        return np.dot(diff, diff) ** 0.5\n\n    def _assign_distances(self, data_store: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Takes a distance matrix, produced by _distances or provided through\n        user input, and assigns distances for each observation to the storage\n        matrix, data_store.\n        :param data_store: the storage matrix that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        for vec, cluster_id in zip(\n            range(self.distance_matrix.shape[0]), self._cluster_labels()\n        ):\n            data_store[vec][0] = cluster_id\n            data_store[vec][1] = self.distance_matrix[vec]\n            
data_store[vec][2] = self.neighbor_matrix[vec]\n        return data_store\n\n    @staticmethod\n    def _compute_distance_and_neighbor_matrix(\n        clust_points_vector: np.ndarray,\n        indices: np.ndarray,\n        distances: np.ndarray,\n        indexes: np.ndarray,\n    ) -> Tuple[np.ndarray, np.ndarray, int]:\n        \"\"\"\n        This helper method provides the heavy lifting for the _distances\n        method and is only intended for use therein. The code has been\n        written so that it can make full use of Numba's jit capabilities if\n        desired.\n        \"\"\"\n        for i in range(clust_points_vector.shape[0]):\n            for j in range(i + 1, clust_points_vector.shape[0]):\n                # Global index of the points\n                global_i = indices[0][i]\n                global_j = indices[0][j]\n\n                # Compute Euclidean distance\n                diff = clust_points_vector[i] - clust_points_vector[j]\n                d = np.dot(diff, diff) ** 0.5\n\n                # Update distance and neighbor index for global_i\n                idx_max = distances[global_i].argmax()\n                if d < distances[global_i][idx_max]:\n                    distances[global_i][idx_max] = d\n                    indexes[global_i][idx_max] = global_j\n\n                # Update distance and neighbor index for global_j\n                idx_max = distances[global_j].argmax()\n                if d < distances[global_j][idx_max]:\n                    distances[global_j][idx_max] = d\n                    indexes[global_j][idx_max] = global_i\n\n            yield distances, indexes, i\n\n    def _distances(self, progress_bar: bool = False) -> None:\n        \"\"\"\n        Provides the distances between each observation and it's closest\n        neighbors. When input data is provided, calculates the euclidean\n        distance between every observation. 
Otherwise, the user-provided\n        distance matrix is used.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        distances = np.full(\n            [self._n_observations(), self.n_neighbors], 9e10, dtype=float\n        )\n        indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)\n        self.points_vector = self._convert_to_array(self.data)\n        compute = (\n            numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)\n            if self.use_numba\n            else self._compute_distance_and_neighbor_matrix\n        )\n        progress = \"=\"\n        for cluster_id in set(self._cluster_labels()):\n            indices = np.where(self._cluster_labels() == cluster_id)\n            clust_points_vector = np.array(\n                self.points_vector.take(indices, axis=0)[0], dtype=np.float64\n            )\n            # a generator that yields an updated distance matrix on each loop\n            for c in compute(clust_points_vector, indices, distances, indexes):\n                distances, indexes, i = c\n                # update the progress bar\n                if progress_bar is True:\n                    progress = Utils.emit_progress_bar(\n                        progress, i + 1, clust_points_vector.shape[0]\n                    )\n\n        self.distance_matrix = distances\n        self.neighbor_matrix = indexes\n\n    def _ssd(self, data_store: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculates the sum squared distance between neighbors for each\n        observation in the input data.\n        :param data_store: the storage matrix that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        self.cluster_labels_u = np.unique(data_store[:, 0])\n        ssd_array = np.empty([self._n_observations(), 1])\n   
     for cluster_id in self.cluster_labels_u:\n            indices = np.where(data_store[:, 0] == cluster_id)\n            cluster_distances = np.take(data_store[:, 1], indices).tolist()\n            ssd = np.power(cluster_distances[0], 2).sum(axis=1)\n            for i, j in zip(indices[0], ssd):\n                ssd_array[i] = j\n        data_store = np.hstack((data_store, ssd_array))\n        return data_store\n\n    def _standard_distances(self, data_store: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculated the standard distance for each observation in the input\n        data. First calculates the cardinality and then calculates the standard\n        distance with respect to each observation.\n        :param data_store:\n        :param data_store: the storage matrix that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        cardinality = [self.n_neighbors] * self._n_observations()\n        vals = data_store[:, 3].tolist()\n        std_distances = []\n        for c, v in zip(cardinality, vals):\n            std_distances.append(self._standard_distance(c, v))\n        return np.hstack((data_store, np.array([std_distances]).T))\n\n    def _prob_distances(self, data_store: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculates the probabilistic distance for each observation in the\n        input data.\n        :param data_store: the storage matrix that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        prob_distances = []\n        for i in range(data_store[:, 4].shape[0]):\n            prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))\n        return np.hstack((data_store, np.array([prob_distances]).T))\n\n    def _prob_distances_ev(self, data_store) -> np.ndarray:\n        
\"\"\"\n        Calculates the expected value of the probabilistic distance for\n        each observation in the input data with respect to the cluster the\n        observation belongs to.\n        :param data_store: the storage matrix that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        prob_set_distance_ev = np.empty([self._n_observations(), 1])\n        for cluster_id in self.cluster_labels_u:\n            indices = np.where(data_store[:, 0] == cluster_id)[0]\n            for index in indices:\n                # Global neighbor indices for the current point\n                nbrhood = data_store[index][2].astype(int)  # Ensure global indices\n                nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(\n                    float\n                )\n                nbrhood_prob_distances_nonan = nbrhood_prob_distances[\n                    np.logical_not(np.isnan(nbrhood_prob_distances))\n                ]\n                prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()\n\n        self.prob_distances_ev = prob_set_distance_ev\n        return np.hstack((data_store, prob_set_distance_ev))\n\n    def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculates the probabilistic local outlier factor for each\n        observation in the input data.\n        :param data_store: the storage matrix that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        return np.hstack(\n            (\n                data_store,\n                np.array(\n                    [\n                        np.apply_along_axis(\n                            self._prob_outlier_factor,\n                            0,\n                            
data_store[:, 5],\n                            data_store[:, 6],\n                        )\n                    ]\n                ).T,\n            )\n        )\n\n    def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculates the expected value of the probabilistic local outlier factor\n        for each observation in the input data with respect to the cluster the\n        observation belongs to.\n        :param data_store: the storage matrix that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        prob_local_outlier_factor_ev_dict = {}\n        for cluster_id in self.cluster_labels_u:\n            indices = np.where(data_store[:, 0] == cluster_id)\n            prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(\n                float\n            )\n            prob_local_outlier_factors_nonan = prob_local_outlier_factors[\n                np.logical_not(np.isnan(prob_local_outlier_factors))\n            ]\n            prob_local_outlier_factor_ev_dict[cluster_id] = np.power(\n                prob_local_outlier_factors_nonan, 2\n            ).sum() / float(prob_local_outlier_factors_nonan.size)\n        data_store = np.hstack(\n            (\n                data_store,\n                np.array(\n                    [\n                        [\n                            prob_local_outlier_factor_ev_dict[x]\n                            for x in data_store[:, 0].tolist()\n                        ]\n                    ]\n                ).T,\n            )\n        )\n        return data_store\n\n    def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculates the normalized probabilistic local outlier factor for each\n        observation in the input data.\n        :param data_store: the storage matrix 
that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        return np.hstack(\n            (\n                data_store,\n                np.array(\n                    [\n                        self._norm_prob_outlier_factor(\n                            self.extent, data_store[:, 8].tolist()\n                        )\n                    ]\n                ).T,\n            )\n        )\n\n    def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculates the local outlier probability for each observation in the\n        input data.\n        :param data_store: the storage matrix that collects information on\n        each observation.\n        :return: the updated storage matrix that collects information on\n        each observation.\n        \"\"\"\n        return np.hstack(\n            (\n                data_store,\n                np.array(\n                    [\n                        np.apply_along_axis(\n                            self._local_outlier_probability,\n                            0,\n                            data_store[:, 7],\n                            data_store[:, 9],\n                        )\n                    ]\n                ).T,\n            )\n        )\n\n    \"\"\"\n    Public methods\n    \"\"\"\n\n    def fit(self) -> \"LocalOutlierProbability\":\n        \"\"\"\n        Calculates the local outlier probability for each observation in the\n        input data according to the input parameters extent, n_neighbors, and\n        cluster_labels.\n        :return: self, which contains the local outlier probabilities as\n        self.local_outlier_probabilities.\n        :raises ClusterSizeError: if any cluster is smaller than n_neighbors.\n        :raises MissingValuesError: if data contains missing values.\n        \"\"\"\n\n        
self._check_n_neighbors()\n        self._check_cluster_size()\n        if self.data is not None:\n            self._check_missing_values()\n\n        store = self._store()\n        if self.data is not None:\n            self._distances(progress_bar=self.progress_bar)\n        store = self._assign_distances(store)\n        store = self._ssd(store)\n        store = self._standard_distances(store)\n        store = self._prob_distances(store)\n        self.prob_distances = store[:, 5]\n        store = self._prob_distances_ev(store)\n        store = self._prob_local_outlier_factors(store)\n        store = self._prob_local_outlier_factors_ev(store)\n        store = self._norm_prob_local_outlier_factors(store)\n        self.norm_prob_local_outlier_factor = store[:, 9].max()\n        store = self._local_outlier_probabilities(store)\n        self.local_outlier_probabilities = store[:, 10]\n\n        self.is_fit = True\n\n        return self\n\n    def stream(self, x: np.ndarray) -> np.ndarray:\n        \"\"\"\n        Calculates the local outlier probability for an individual sample\n        according to the input parameters extent, n_neighbors, and\n        cluster_labels after first calling fit(). 
Observations are assigned\n        a local outlier probability against the mean of expected values of\n        probabilistic distance and the normalized probabilistic outlier\n        factor from the earlier model, provided when calling fit().\n        distance\n        :param x: an observation to score for its local outlier probability.\n        :return: the local outlier probability of the input observation.\n        \"\"\"\n\n        orig_cluster_labels = None\n        if self._check_no_cluster_labels() is False:\n            orig_cluster_labels = self.cluster_labels\n            self.cluster_labels = np.array([0] * len(self.data))\n\n        if self._check_is_fit() is False:\n            self.fit()\n\n        point_vector = self._convert_to_array(x)\n        distances = np.full([1, self.n_neighbors], 9e10, dtype=float)\n        if self.data is not None:\n            matrix = self.points_vector\n        else:\n            matrix = self.distance_matrix\n            # When using distance matrix mode, x is a scalar distance value.\n            # Extract scalar from array to avoid NumPy assignment errors.\n            if point_vector.size == 1:\n                point_vector = float(point_vector.flat[0])\n        for p in range(0, matrix.shape[0]):\n            if self.data is not None:\n                d = self._euclidean(matrix[p, :], point_vector)\n            else:\n                d = point_vector\n            idx_max = distances[0].argmax()\n            if d < distances[0][idx_max]:\n                distances[0][idx_max] = d\n\n        ssd = np.power(distances, 2).sum()\n        std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))\n        prob_dist = self._prob_distance(self.extent, std_dist)\n        plof = self._prob_outlier_factor(\n            np.array(prob_dist), np.array(self.prob_distances_ev.mean())\n        )\n        loop = self._local_outlier_probability(\n            plof, self.norm_prob_local_outlier_factor\n        )\n\n        if 
orig_cluster_labels is not None:\n            self.cluster_labels = orig_cluster_labels\n\n        return loop\n"
  },
  {
    "path": "README.md",
    "content": "# PyNomaly\n\nPyNomaly is a Python 3 implementation of LoOP (Local Outlier Probabilities).\nLoOP is a local density based outlier detection method by Kriegel, Kröger, Schubert, and Zimek which provides outlier\nscores in the range of [0,1] that are directly interpretable as the probability of a sample being an outlier. \n\nPyNomaly is a core library of [deepchecks](https://github.com/deepchecks/deepchecks), [OmniDocBench](https://github.com/opendatalab/OmniDocBench) and [pysad](https://github.com/selimfirat/pysad). \n\n[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)\n[![PyPi](https://img.shields.io/badge/pypi-0.3.5-blue.svg)](https://pypi.python.org/pypi/PyNomaly/0.3.5)\n[![Total Downloads](https://static.pepy.tech/badge/pynomaly)](https://pepy.tech/projects/pynomaly)\n[![Monthly Downloads](https://static.pepy.tech/badge/pynomaly/month)](https://pepy.tech/projects/pynomaly)\n![Tests](https://github.com/vc1492a/PyNomaly/actions/workflows/tests.yml/badge.svg)\n[![Coverage Status](https://coveralls.io/repos/github/vc1492a/PyNomaly/badge.svg?branch=main)](https://coveralls.io/github/vc1492a/PyNomaly?branch=main)\n[![JOSS](http://joss.theoj.org/papers/f4d2cfe680768526da7c1f6a2c103266/status.svg)](http://joss.theoj.org/papers/f4d2cfe680768526da7c1f6a2c103266)\n\nThe outlier score of each sample is called the Local Outlier Probability.\nIt measures the local deviation of density of a given sample with\nrespect to its neighbors as Local Outlier Factor (LOF), but provides normalized\noutlier scores in the range [0,1]. These outlier scores are directly interpretable\nas a probability of an object being an outlier. 
Since Local Outlier Probabilities provides scores in the\nrange [0,1], practitioners are free to interpret the results according to the application.\n\nLike LOF, it is local in that the anomaly score depends on how isolated the sample is\nwith respect to the surrounding neighborhood. Locality is given by k-nearest neighbors,\nwhose distance is used to estimate the local density. By comparing the local density of a sample to the\nlocal densities of its neighbors, one can identify samples that lie in regions of lower\ndensity compared to their neighbors and thus identify samples that may be outliers according to their Local\nOutlier Probability.\n\nThe authors' 2009 paper detailing LoOP's theory, formulation, and application is provided by\nLudwig-Maximilians University Munich - Institute for Informatics;\n[LoOP: Local Outlier Probabilities](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf).\n\n## Implementation\n\nThis Python 3 implementation uses Numpy and the formulas outlined in\n[LoOP: Local Outlier Probabilities](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf)\nto calculate the Local Outlier Probability of each sample.\n\n## Dependencies\n- Python 3.8 - 3.13\n- numpy >= 1.16.3\n- python-utils >= 2.3.0\n- (optional) numba >= 0.45.1\n\nNumba just-in-time (JIT) compiles the function which calculates the Euclidean \ndistance between observations, providing a reduction in computation time \n(significantly when a large number of observations are scored). Numba is not a \nrequirement and PyNomaly may still be used solely with numpy if desired\n(details below). \n\n## Quick Start\n\nFirst install the package from the Python Package Index:\n\n```shell\npip install PyNomaly # or pip3 install ... 
if you're using both Python 3 and 2.\n```\n\nAlternatively, you can use conda to install the package from conda-forge:\n\n```shell\nconda install conda-forge::pynomaly\n```\nThen you can do something like this:\n\n```python\nfrom PyNomaly import loop\nm = loop.LocalOutlierProbability(data).fit()\nscores = m.local_outlier_probabilities\nprint(scores)\n```\nwhere *data* is a NxM (N rows, M columns; 2-dimensional) set of data as either a Pandas DataFrame or Numpy array.\n\nLocalOutlierProbability sets the *extent* (in integer in value of 1, 2, or 3) and *n_neighbors* (must be greater than 0) parameters with the default\nvalues of 3 and 10, respectively. You're free to set these parameters on your own as below:\n\n```python\nfrom PyNomaly import loop\nm = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20).fit()\nscores = m.local_outlier_probabilities\nprint(scores)\n```\n\nThis implementation of LoOP also includes an optional *cluster_labels* parameter. This is useful in cases where regions\nof varying density occur within the same set of data. 
When using *cluster_labels*, the Local Outlier Probability of a\nsample is calculated with respect to its cluster assignment.\n\n```python\nfrom PyNomaly import loop\nfrom sklearn.cluster import DBSCAN\ndb = DBSCAN(eps=0.6, min_samples=50).fit(data)\nm = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20, cluster_labels=list(db.labels_)).fit()\nscores = m.local_outlier_probabilities\nprint(scores)\n```\n\n**NOTE**: Unless your data is all the same scale, it may be a good idea to normalize your data with z-scores or another\nnormalization scheme prior to using LoOP, especially when working with multiple dimensions of varying scale.\nUsers must also appropriately handle missing values prior to using LoOP, as LoOP does not support Pandas\nDataFrames or Numpy arrays with missing values.\n\n### Utilizing Numba and Progress Bars\n\nIt may be helpful to use just-in-time (JIT) compilation in the cases where a lot of \nobservations are scored. Numba, a JIT compiler for Python, may be used \nwith PyNomaly by setting `use_numba=True`:\n\n```python\nfrom PyNomaly import loop\nm = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20, use_numba=True, progress_bar=True).fit()\nscores = m.local_outlier_probabilities\nprint(scores)\n```\n\nNumba must be installed in order to use JIT compilation and improve the \nspeed of multiple calls to `LocalOutlierProbability()`, and PyNomaly has been \ntested with Numba version 0.45.1. An example of the speed difference that can \nbe realized with using Numba is available in `examples/numba_speed_diff.py`. \n\nYou may also choose to print progress bars _with or without_ the use of numba \nby passing `progress_bar=True` to the `LocalOutlierProbability()` method as above.\n\n### Choosing Parameters\n\nThe *extent* parameter controls the sensitivity of the scoring in practice. 
The parameter corresponds to\nthe statistical notion of an outlier defined as an object deviating more than a given lambda (*extent*)\ntimes the standard deviation from the mean. A value of 2 implies outliers deviating more than 2 standard deviations\nfrom the mean, and corresponds to 95.0% in the empirical \"three-sigma\" rule. The appropriate parameter should be selected\naccording to the level of sensitivity needed for the input data and application. The question to ask is whether it is\nmore reasonable to assume outliers in your data are 1, 2, or 3 standard deviations from the mean, and select the value\nlikely most appropriate to your data and application.\n\nThe *n_neighbors* parameter defines the number of neighbors to consider about\neach sample (neighborhood size) when determining its Local Outlier Probability with respect to the density\nof the sample's defined neighborhood. The ideal number of neighbors to consider is dependent on the\ninput data. However, the notion of an outlier implies it would be considered as such regardless of the number\nof neighbors considered. One potential approach is to use a number of different neighborhood sizes and average\nthe results for each observation. Those observations which rank highly with varying neighborhood sizes are\nmore than likely outliers. This is one potential approach of selecting the neighborhood size. Another is to\nselect a value proportional to the number of observations, such as an odd-valued integer close to the square root\nof the number of observations in your data (*sqrt(n_observations)*).\n\n## Iris Data Example\n\nWe'll be using the well-known Iris dataset to show LoOP's capabilities. 
There are a few things you'll need for this\nexample beyond the standard prerequisites listed above:\n- matplotlib 2.0.0 or greater\n- PyDataset 0.2.0 or greater\n- scikit-learn 0.18.1 or greater\n\nFirst, let's import the packages and libraries we will need for this example.\n\n```python\nfrom PyNomaly import loop\nimport pandas as pd\nfrom pydataset import data\nimport numpy as np\nfrom sklearn.cluster import DBSCAN\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.mplot3d import Axes3D\n```\n\nNow let's create two sets of Iris data for scoring; one with clustering and the other without.\n\n```python\n# import the data and remove any non-numeric columns\niris = pd.DataFrame(data('iris').drop(columns=['Species']))\n```\n\nNext, let's cluster the data using DBSCAN and generate two sets of scores. In both cases, we will use the default\nvalues for both *extent* (3) and *n_neighbors* (10).\n\n```python\ndb = DBSCAN(eps=0.9, min_samples=10).fit(iris)\nm = loop.LocalOutlierProbability(iris).fit()\nscores_noclust = m.local_outlier_probabilities\nm_clust = loop.LocalOutlierProbability(iris, cluster_labels=list(db.labels_)).fit()\nscores_clust = m_clust.local_outlier_probabilities\n```\n\nOrganize the data into two separate Pandas DataFrames.\n\n```python\niris_clust = pd.DataFrame(iris.copy())\niris_clust['scores'] = scores_clust\niris_clust['labels'] = db.labels_\niris['scores'] = scores_noclust\n```\n\nAnd finally, let's visualize the scores provided by LoOP in both cases (with and without clustering).\n\n```python\nfig = plt.figure(figsize=(7, 7))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],\nc=iris['scores'], cmap='seismic', s=50)\nax.set_xlabel('Sepal.Width')\nax.set_ylabel('Petal.Width')\nax.set_zlabel('Sepal.Length')\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n\nfig = plt.figure(figsize=(7, 7))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(iris_clust['Sepal.Width'], 
iris_clust['Petal.Width'], iris_clust['Sepal.Length'],\nc=iris_clust['scores'], cmap='seismic', s=50)\nax.set_xlabel('Sepal.Width')\nax.set_ylabel('Petal.Width')\nax.set_zlabel('Sepal.Length')\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n\nfig = plt.figure(figsize=(7, 7))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],\nc=iris_clust['labels'], cmap='Set1', s=50)\nax.set_xlabel('Sepal.Width')\nax.set_ylabel('Petal.Width')\nax.set_zlabel('Sepal.Length')\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n```\n\nYour results should look like the following:\n\n**LoOP Scores without Clustering**\n![LoOP Scores without Clustering](https://github.com/vc1492a/PyNomaly/blob/main/images/scores.png)\n\n**LoOP Scores with Clustering**\n![LoOP Scores with Clustering](https://github.com/vc1492a/PyNomaly/blob/main/images/scores_clust.png)\n\n**DBSCAN Cluster Assignments**\n![DBSCAN Cluster Assignments](https://github.com/vc1492a/PyNomaly/blob/main/images/cluster_assignments.png)\n\n\nNote the differences between using LocalOutlierProbability with and without clustering. In the example without clustering, samples are\nscored according to the distribution of the entire data set. In the example with clustering, each sample is scored\naccording to the distribution of each cluster. 
Which approach is suitable depends on the use case.\n\n**NOTE**: Data was not normalized in this example, but it's probably a good idea to do so in practice.\n\n## Using Numpy\n\nWhen using numpy, make sure to use 2-dimensional arrays in tabular format:\n\n```python\ndata = np.array([\n    [43.3, 30.2, 90.2],\n    [62.9, 58.3, 49.3],\n    [55.2, 56.2, 134.2],\n    [48.6, 80.3, 50.3],\n    [67.1, 60.0, 55.9],\n    [421.5, 90.3, 50.0]\n])\n\nscores = loop.LocalOutlierProbability(data, n_neighbors=3).fit().local_outlier_probabilities\nprint(scores)\n\n```\n\nThe shape of the input array corresponds to the rows (observations) and columns (features) in the data:\n\n```python\nprint(data.shape)\n# (6,3), which matches number of observations and features in the above example\n```\n\nSimilar to the above:\n\n```python\ndata = np.random.rand(100, 5)\nscores = loop.LocalOutlierProbability(data).fit().local_outlier_probabilities\nprint(scores)\n```\n\n## Specifying a Distance Matrix\n\nPyNomaly provides the ability to specify a distance matrix so that any\ndistance metric can be used (a neighbor index matrix must also be provided).\nThis can be useful when wanting to use a distance other than the euclidean.\n\nNote that in order to maintain alignment with the LoOP definition of closest neighbors, \nan additional neighbor is added when using [scikit-learn's NearestNeighbors](https://scikit-learn.org/1.5/modules/neighbors.html) since `NearestNeighbors` \nincludes the point itself when calculating the closest neighbors (whereas the LoOP method does not include distances to the point itself). 
\n\n```python\nimport numpy as np\nfrom sklearn.neighbors import NearestNeighbors\n\ndata = np.array([\n    [43.3, 30.2, 90.2],\n    [62.9, 58.3, 49.3],\n    [55.2, 56.2, 134.2],\n    [48.6, 80.3, 50.3],\n    [67.1, 60.0, 55.9],\n    [421.5, 90.3, 50.0]\n])\n\n# Generate distance and neighbor matrices\nn_neighbors = 3 # the number of neighbors according to the LoOP definition \nneigh = NearestNeighbors(n_neighbors=n_neighbors+1, metric='hamming')\nneigh.fit(data)\nd, idx = neigh.kneighbors(data, return_distance=True)\n\n# Remove self-distances - you MUST do this to preserve the same results as intended by the definition of LoOP\nidx = np.delete(idx, 0, 1)\nd = np.delete(d, 0, 1)\n\n# Fit and return scores\nm = loop.LocalOutlierProbability(distance_matrix=d, neighbor_matrix=idx, n_neighbors=n_neighbors+1).fit()\nscores = m.local_outlier_probabilities\n```\n\nThe below visualization shows the results by a few known distance metrics:\n\n**LoOP Scores by Distance Metric**\n![DBSCAN Cluster Assignments](https://github.com/vc1492a/PyNomaly/blob/main/images/scores_by_distance_metric.png)\n\n## Streaming Data\n\nPyNomaly also contains an implementation of Hamlet et al.'s modifications\nto the original LoOP approach [[4](http://www.tandfonline.com/doi/abs/10.1080/23742917.2016.1226651?journalCode=tsec20)],\nwhich may be used for applications involving streaming data or where rapid calculations may be necessary.\nFirst, the standard LoOP algorithm is used on \"training\" data, with certain attributes of the fitted data\nstored from the original LoOP approach. Then, as new points are considered, these fitted attributes are\ncalled when calculating the score of the incoming streaming data due to the use of averages from the initial\nfit, such as the use of a global value for the expected value of the probabilistic distance. 
Despite the potential\nfor increased error when compared to the standard approach, it may be effective in streaming applications where\nrefitting the standard approach over all points could be computationally expensive.\n\nWhile the iris dataset is not streaming data, we'll use it in this example by taking the first 120 observations\nas training data and take the remaining 30 observations as a stream, scoring each observation\nindividually.\n\nSplit the data.\n```python\niris = iris.sample(frac=1) # shuffle data\niris_train = iris.iloc[:, 0:4].head(120)\niris_test = iris.iloc[:, 0:4].tail(30)\n```\n\nFit to each set.\n```python\nm = loop.LocalOutlierProbability(iris).fit()\nscores_noclust = m.local_outlier_probabilities\niris['scores'] = scores_noclust\n\nm_train = loop.LocalOutlierProbability(iris_train, n_neighbors=10)\nm_train.fit()\niris_train_scores = m_train.local_outlier_probabilities\n```\n\n```python\niris_test_scores = []\nfor index, row in iris_test.iterrows():\n    array = np.array([row['Sepal.Length'], row['Sepal.Width'], row['Petal.Length'], row['Petal.Width']])\n    iris_test_scores.append(m_train.stream(array))\niris_test_scores = np.array(iris_test_scores)\n```\n\nConcatenate the scores and assess.\n\n```python\niris['stream_scores'] = np.hstack((iris_train_scores, iris_test_scores))\n# iris['scores'] from earlier example\nrmse = np.sqrt(((iris['scores'] - iris['stream_scores']) ** 2).mean(axis=None))\nprint(rmse)\n```\n\nThe root mean squared error (RMSE) between the two approaches is approximately 0.199 (your scores will vary depending on the data and specification).\nThe plot below shows the scores from the stream approach.\n\n```python\nfig = plt.figure(figsize=(7, 7))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],\nc=iris['stream_scores'], cmap='seismic', 
s=50)\nax.set_xlabel('Sepal.Width')\nax.set_ylabel('Petal.Width')\nax.set_zlabel('Sepal.Length')\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n```\n\n**LoOP Scores using Stream Approach with n=10**\n![LoOP Scores using Stream Approach with n=10](https://github.com/vc1492a/PyNomaly/blob/main/images/scores_stream.png)\n\n### Notes\nWhen calculating the LoOP score of incoming data, the original fitted scores are not updated.\nIn some applications, it may be beneficial to refit the data periodically. The stream functionality\nalso assumes that either data or a distance matrix (or value) will be used across in both fitting\nand streaming, with no changes in specification between steps.\n\n## Contributing\n\nPlease use the issue tracker to report any erroneous behavior or desired \nfeature requests. \n\nIf you would like to contribute to development, please fork the repository and make \nany changes to a branch which corresponds to an open issue. Hot fixes \nand bug fixes can be represented by branches with the prefix `fix/` versus \n`feature/` for new capabilities or code improvements. Pull requests will \nthen be made from these branches into the repository's `dev` branch \nprior to being pulled into `main`. \n\n### Commit Messages and Releases\n\n**Your commit messages are important** - here's why. \n\nPyNomaly leverages [release-please](https://github.com/googleapis/release-please-action) to help automate the release process using the [Conventional Commits](https://www.conventionalcommits.org/) specification. When pull requests are opened to the `main` branch, release-please will collate the git commit messages and prepare an organized changelog and release notes. This process can be completed because of the Conventional Commits specification. \n\nConventional Commits provides an easy set of rules for creating an explicit commit history; which makes it easier to write automated tools on top of. 
This convention dovetails with SemVer, by describing the features, fixes, and breaking changes made in commit messages. You can check out examples [here](https://www.conventionalcommits.org/en/v1.0.0/#examples). Make a best effort to use the specification when contributing to PyNomaly code as it dramatically eases the documentation around releases and their features, breaking changes, bug fixes and documentation updates. \n\n### Tests\nWhen contributing, please ensure you run unit tests and add additional tests as \nnecessary if adding new functionality. To run the unit tests, use `pytest`: \n\n```\npython3 -m pytest --cov=PyNomaly -s -v\n```\n\nTo run the tests with Numba enabled, simply set the flag `NUMBA` in `test_loop.py` \nto `True`. Note that a drop in coverage is expected due to portions of the code \nbeing compiled upon code execution. \n\n## Versioning\n[Semantic versioning](http://semver.org/) is used for this project. If contributing, please conform to semantic\nversioning guidelines when submitting a pull request.\n\n## License\nThis project is licensed under the Apache 2.0 license.\n\n## Research\nIf citing PyNomaly, use the following: \n\n```\n@article{Constantinou2018,\n  doi = {10.21105/joss.00845},\n  url = {https://doi.org/10.21105/joss.00845},\n  year  = {2018},\n  month = {oct},\n  publisher = {The Open Journal},\n  volume = {3},\n  number = {30},\n  pages = {845},\n  author = {Valentino Constantinou},\n  title = {{PyNomaly}: Anomaly detection using Local Outlier Probabilities ({LoOP}).},\n  journal = {Journal of Open Source Software}\n}\n```\n\n\n## References\n1. Breunig M., Kriegel H.-P., Ng R., Sander, J. LOF: Identifying Density-based Local Outliers. ACM SIGMOD International Conference on Management of Data (2000). [PDF](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LOF.pdf).\n2. Kriegel H., Kröger P., Schubert E., Zimek A. LoOP: Local Outlier Probabilities. 18th ACM conference on Information and knowledge management, CIKM (2009). 
[PDF](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf).\n3. Goldstein M., Uchida S. A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data. PLoS ONE 11(4): e0152173 (2016).\n4. Hamlet C., Straub J., Russell M., Kerlin S. An incremental and approximate local outlier probability algorithm for intrusion detection and its evaluation. Journal of Cyber Security Technology (2016). [DOI](http://www.tandfonline.com/doi/abs/10.1080/23742917.2016.1226651?journalCode=tsec20).\n\n## Acknowledgements\n- The authors of LoOP (Local Outlier Probabilities)\n    - Hans-Peter Kriegel\n    - Peer Kröger\n    - Erich Schubert\n    - Arthur Zimek\n- [NASA Jet Propulsion Laboratory](https://jpl.nasa.gov/)\n    - [Kyle Hundman](https://github.com/khundman)\n    - [Ian Colwell](https://github.com/iancolwell)\n"
  },
  {
    "path": "changelog.md",
    "content": "# Changelog\nAll notable changes to PyNomaly will be documented in this Changelog.\n\nThe format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) \nand adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).\n\n## 0.3.5\n### Changed\n- Refactored the `Validate` class by dissolving it and moving validation methods \ndirectly into `LocalOutlierProbability` as instance methods \n([Issue #69](https://github.com/vc1492a/PyNomaly/issues/69)).\n- Renamed validation methods for clarity: `_fit()` → `_check_is_fit()`, \n`_data()` → `_convert_to_array()`, `_inputs()` → `_validate_inputs()`, \n`_cluster_size()` → `_check_cluster_size()`, `_n_neighbors()` → `_check_n_neighbors()`, \n`_extent()` → `_check_extent()`, `_missing_values()` → `_check_missing_values()`, \n`_no_cluster_labels()` → `_check_no_cluster_labels()`.\n- Replaced `sys.exit()` calls with proper exception handling. The library no longer \nterminates the Python process on validation errors.\n### Added\n- Custom exception classes for better error handling: `PyNomalyError` (base), \n`ValidationError`, `ClusterSizeError`, and `MissingValuesError`. 
These are now \nexported from the package and can be caught by users.\n### Fixed\n- Fixed a compatibility issue with NumPy in Python 3.11+ where assigning an array \nto a scalar position in `stream()` would raise a `ValueError` when using distance \nmatrix mode.\n\n## 0.3.4 \n### Changed \n- Changed source code as necessary to address a [user-reported issue](https://github.com/vc1492a/PyNomaly/issues/49), corrected in [this commit](https://github.com/vc1492a/PyNomaly/commit/bbdd12a318316ca9c7e0272a5b06909f3fc4f9b0)\n\n## 0.3.3\n### Changed\n- The implementation of the progress bar to support use when the number of \nobservations is less than the width of the Python console in which the code \nis being executed (tracked in [this issue](https://github.com/vc1492a/PyNomaly/issues/35)).\n### Added\n- Docstring to the testing functions to provide some additional documentation \nof the testing (tracked in [this issue](https://github.com/vc1492a/PyNomaly/issues/41)).\n\n## 0.3.2\n### Changed\n- Removed numba as a strict dependency, which is now an optional dependency \nthat is not needed to use PyNomaly but which provides performance enhancements \nwhen functions are called repeatedly, such as when the number of observations \nis large. This relaxes the numba requirement introduced in version 0.3.0. \n### Added\n- Added progress bar functionality that can be called using \n`LocalOutlierProbability(progress_bar=True)` in both native \nPython and numba just-in-time (JIT) compiled modes. \nThis is helpful in cases where PyNomaly is processing a large amount \nof observations.  \n\n\n## 0.3.1\n### Changed\n- Removed Numba JIT compilation from the `_standard_distance` and \n`_prob_distance` calculations. 
Using Numba JIT compilation there does \nnot result in a speed improvement and only add compilation overhead.\n- Integrated [pull request #33](https://github.com/vc1492a/PyNomaly/pull/33) \nwhich decreases runtime about 30 to more than 90 percent in some cases, in \nparticular on repeated calls with larger datasets. \n### Added\n- Type hinting for unit tests in `tests/test_loop.py`.\n\n## 0.3.0\n### Changed\n- The manner in which the standard distance is calculated from list \ncomprehension to a vectorized Numpy implementation, reducing compute \ntime for that specific calculation by approximately 75%. \n- Removed formal testing and support for Python 3.4 \n([Python 3 adoption rates](https://rushter.com/blog/python-3-adoption/)).\n- Raised the minimum numpy version requirement from 1.12.0 to 1.16.3.\n### Added \n- Numba just in time (JIT) compilation to improve the speed of some \nof the core functionality, consistently achieving a further 20% reduction \nin compute time when _n_ = 1000. Future optimizations could yield \nfurther reductions in computation time. For now, requiring a strict numba version of `0.43.1` \nin anticipation of [this deprecation](http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types) - \nwhich does not yet have an implemented solution. 
\n\n## 0.2.7\n### Changed\n- Integrated various performance enhancements as described in \n[pull request #30](https://github.com/vc1492a/PyNomaly/pull/30) that \nincrease PyNomaly's performance by at least up to 50% in some cases.\n- The Validate classes functions from public to private, as they are only \nused in validating specification and data input into PyNomaly.\n### Added\n- [Issue #27](https://github.com/vc1492a/PyNomaly/issues/27) - Added \ndocstring to key functions in PyNomaly to ease future development and \nprovide additional information.\n- Additional unit tests to raise code coverage from 96% to 100%.\n\n## 0.2.6\n### Fixed\n- [Issue #25](https://github.com/vc1492a/PyNomaly/issues/25) - Fixed an issue\nthat caused zero division errors when all the values in a neighborhood are\nduplicate samples.\n### Changed\n- The error behavior when attempting to use the stream approach\nbefore calling `fit`. While the previous implementation resulted in a\nwarning and system exit, PyNomaly now attempts to `fit` (assumes data or a\ndistance matrix is available) and then later calls `stream`. If no\ndata or distance matrix is provided, a warning is raised.\n### Added\n- [Issue #24](https://github.com/vc1492a/PyNomaly/issues/24) - Added\nthe ability to use one's own distance matrix,\nprovided a neighbor index matrix is also provided. 
This ensures\nPyNomaly can be used with distances other than the euclidean.\nSee the file `iris_dist_grid.py` for examples.\n- [Issue #23](https://github.com/vc1492a/PyNomaly/issues/23) - Added\nPython 3.7 to the tested distributions in Travis CI and passed tests.\n- Unit tests to monitor the issues and features covered\nin issues [24](https://github.com/vc1492a/PyNomaly/issues/24) and\n[25](https://github.com/vc1492a/PyNomaly/issues/25).\n\n\n## 0.2.5\n### Fixed\n- [Issue #20](https://github.com/vc1492a/PyNomaly/issues/20) - Fixed\na bug that inadvertently used global means of the probabilistic distance\nas the expected value of the probabilistic distance, as opposed to the\nexpected value of the probabilistic distance within a neighborhood of\na point.\n- Integrated [pull request #21](https://github.com/vc1492a/PyNomaly/pull/21) -\nThis pull request addressed the issue noted above.\n### Changed\n- Changed the default behavior to strictly not supporting the\nuse of missing values in the input data, as opposed to the soft enforcement\n(a simple user warning) used in the previous behavior.\n\n## 0.2.4\n### Fixed\n- [Issue #17](https://github.com/vc1492a/PyNomaly/issues/17) - Fixed\na bug that allowed for a column of empty values in the primary data store.\n- Integrated [pull request #18](https://github.com/vc1492a/PyNomaly/pull/18) -\nFixed a bug that was not causing dependencies such as numpy to skip\ninstallation when installing PyNomaly via pip.\n\n## 0.2.3\n### Fixed\n- [Issue #14](https://github.com/vc1492a/PyNomaly/issues/14) - Fixed an issue\nthat was causing a ZeroDivisionError when the specified neighborhood size\nis larger than the total number of observations in the smallest cluster.\n\n## 0.2.2\n### Changed\n- This implementation to align more closely with the specification of the\napproach in the original paper. The extent parameter now takes an integer\nvalue of 1, 2, or 3 that corresponds to the lambda parameter specified\nin the paper. 
See the [readme](https://github.com/vc1492a/PyNomaly/blob/master/readme.md) for more details.\n- Refactored the code base and created the Validate class, which includes\nchecks for data type, correct specification, and other dependencies.\n### Added\n- Automated tests to ensure the desired functionality is being met can now be\nfound in the `PyNomaly/tests` directory.\n- Code for the examples in the readme can now be found in the `examples` directory.\n- Additional information for parameter selection in the [readme](https://github.com/vc1492a/PyNomaly/blob/master/readme.md).\n\n## 0.2.1\n### Fixed\n- [Issue #10](https://github.com/vc1492a/PyNomaly/issues/10) - Fixed error on line\n142 which was causing the class to fail. More explicit examples\nwere also included in the readme for using numpy arrays.\n\n### Added\n- An improvement to the Euclidean distance calculation by [MichaelSchreier](https://github.com/MichaelSchreier)\nwhich brings over a 50% reduction in computation time.\n\n## 0.2.0\n### Added\n- Added new functionality to PyNomaly by integrating a modified LoOP\napproach introduced by Hamlet et al. which can be used for streaming\ndata applications or in the case where computational expense is a concern.\nData is first fit to a \"training set\", with any additional observations\nconsidered for outlierness against this initial set.\n\n## 0.1.8\n### Fixed\n- Fixed an issue which allowed the number of neighbors considered to exceed the number of observations. Added a check\nto ensure this is no longer possible.\n\n## 0.1.7\n### Fixed\n- Fixed an issue inadvertently introduced in 0.1.6 that caused distance calculations to be incorrect, \nthus resulting in incorrect LoOP values.  \n\n## 0.1.6\n### Fixed\n- Updated the distance calculation such that the euclidean distance calculation has been separated from \nthe main distance calculation function.\n- Fixed an error in the calculation of the standard distance. 
\n\n### Changed\n- .fit() now returns a fitted object instead of local_outlier_probabilities. Local outlier probabilities can \nnow be retrieved by calling .local_outlier_probabilities. See the readme for an example. \n- Some private functions have been renamed. \n\n## 0.1.5\n### Fixed\n- [Issue #4](https://github.com/vc1492a/PyNomaly/issues/4) - Separated parameter type checks \nfrom checks for invalid parameter values.\n    - @accepts decorator verifies LocalOutlierProbability parameters are of correct type.\n    - Parameter value checks moved from .fit() to init.\n- Fixed parameter check to ensure extent value is in the range (0., 1.] instead of [0, 1] (extent cannot be zero). \n- [Issue #1](https://github.com/vc1492a/PyNomaly/issues/1) -  Added type check using @accepts decorator for cluster_labels.    \n\n## 0.1.4\n### Fixed\n- [Issue #3](https://github.com/vc1492a/PyNomaly/issues/3) - .fit() fails if the sum of squared distances sums to 0.\n    - Added check to ensure the sum of square distances is greater than zero.\n    - Added UserWarning to increase the neighborhood size if all neighbors in n_neighbors are \n    zero distance from an observation. \n- Added UserWarning to check for integer type n_neighbor conditions versus float type.\n- Changed calculation of the probabilistic local outlier factor expected value to Numpy operation\n    from base Python. \n    \n## 0.1.3\n### Fixed\n- Altered the distance matrix computation to return a triangular matrix instead of a \nfully populated matrix. This was made to ensure no duplicate neighbors were present \nin computing the neighborhood distance for each observation. \n\n## 0.1.2\n### Added\n- LICENSE.txt file of Apache License, Version 2.0.\n- setup.py, setup.cfg files configured for release to PyPi.\n- Changed name throughout code base from PyLoOP to PyNomaly.\n\n### Other\n- Initial release to PyPi.\n\n## 0.1.1\n### Other\n- A bad push to PyPi necessitated the need to skip a version number. 
\n    - Chosen name of PyLoOP not present on test index but present on production PyPi index. \n    - Issue not known until push was made to the test index.\n    - Skipped version number to align test and production PyPi indices.\n\n## 0.1.0 - 2017-05-19\n### Added\n- readme.md file documenting methodology, package dependencies, use cases, \nhow to contribute, and acknowledgements.\n- Initial open release of PyNomaly codebase on Github. \n"
  },
  {
    "path": "examples/iris.py",
    "content": "from PyNomaly import loop\nimport pandas as pd\nfrom pydataset import data\nfrom sklearn.cluster import DBSCAN\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.mplot3d import Axes3D\n\n\niris = pd.DataFrame(data('iris'))\niris = pd.DataFrame(iris.drop('Species', 1))\n\n\ndb = DBSCAN(eps=0.9, min_samples=10).fit(iris)\nm = loop.LocalOutlierProbability(iris).fit()\nscores_noclust = m.local_outlier_probabilities\nm_clust = loop.LocalOutlierProbability(iris, cluster_labels=list(db.labels_)).fit()\nscores_clust = m_clust.local_outlier_probabilities\n\n\niris_clust = pd.DataFrame(iris.copy())\niris_clust['scores'] = scores_clust\niris_clust['labels'] = db.labels_\n\niris['scores'] = scores_noclust\n\n\nfig = plt.figure(figsize=(7, 7))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],\nc=iris['scores'], cmap='seismic', s=50)\nax.set_xlabel('Sepal.Width')\nax.set_ylabel('Petal.Width')\nax.set_zlabel('Sepal.Length')\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n\nfig = plt.figure(figsize=(7, 7))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],\nc=iris_clust['scores'], cmap='seismic', s=50)\nax.set_xlabel('Sepal.Width')\nax.set_ylabel('Petal.Width')\nax.set_zlabel('Sepal.Length')\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n\nfig = plt.figure(figsize=(7, 7))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],\nc=iris_clust['labels'], cmap='Set1', s=50)\nax.set_xlabel('Sepal.Width')\nax.set_ylabel('Petal.Width')\nax.set_zlabel('Sepal.Length')\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n"
  },
  {
    "path": "examples/iris_dist_grid.py",
    "content": "from PyNomaly import loop\nimport pandas as pd\nfrom pydataset import data\nfrom sklearn.neighbors import NearestNeighbors\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.mplot3d import Axes3D\n\n\niris = pd.DataFrame(data('iris'))\niris = pd.DataFrame(iris.drop('Species', 1))\n\ndistance_metrics = [\n    'braycurtis',\n    'canberra',\n    'cityblock',\n    'chebyshev',\n    'cosine',\n    'euclidean',\n    'hamming',\n    'l1',\n    'manhattan'\n]\n\nfig = plt.figure(figsize=(17, 17))\n\nfor i in range(1, 10):\n\n    neigh = NearestNeighbors(n_neighbors=10, metric=distance_metrics[i-1])\n    neigh.fit(iris)\n    d, idx = neigh.kneighbors(iris, return_distance=True)\n\n    m = loop.LocalOutlierProbability(distance_matrix=d,\n                                     neighbor_matrix=idx).fit()\n    iris['scores'] = m.local_outlier_probabilities\n\n    ax = fig.add_subplot(3, 3, i, projection='3d')\n    plt.title(distance_metrics[i-1], loc='left', fontsize=18)\n    ax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],\n               c=iris['scores'], cmap='seismic', s=50)\n    ax.set_xlabel('Sepal.Width')\n    ax.set_ylabel('Petal.Width')\n    ax.set_zlabel('Sepal.Length')\n\n\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n\n"
  },
  {
    "path": "examples/multiple_gaussian_2d.py",
    "content": "import numpy as np\nimport matplotlib.pyplot as plt\nfrom PyNomaly import loop\nimport pandas as pd\n\n# import the multiple gaussian data #\ndf = pd.read_csv('../data/multiple-gaussian-2d-data-only.csv')\nprint(df)\n\n# fit LoOP according to the original settings outlined in the paper #\nm = loop.LocalOutlierProbability(df[['x', 'y']], n_neighbors=20, extent=3).fit()\nscores = m.local_outlier_probabilities\nprint(scores)\n\n# plot the results #\n# base 3 width, then set as multiple\nthreshold = 0.1\ncolor = np.where(scores > threshold, \"white\", \"black\")\nlabel_mask = np.where(scores > threshold)\narea = (20 * scores) ** 2\nplt.scatter(df['x'], df['y'], c=color, s=area.astype(float), edgecolor='red', linewidth=1)\nplt.scatter(df['x'], df['y'], c='black', s=3)\nfor i in range(len(scores)):\n    if scores[i] > threshold:\n        plt.text(df['x'].loc[i] * (1 + 0.01), df['y'].loc[i] * (1 + 0.01), round(scores[i], 2), fontsize=8)\n\nplt.show()\n\n"
  },
  {
    "path": "examples/numba_speed_diff.py",
    "content": "import numpy as np\nfrom PyNomaly import loop\nimport time\n\n# generate a large set of data\ndata = np.ones(shape=(10000, 4))\n\n# first time the process without Numba\n# use the progress bar to track progress\n\nt1 = time.time()\nscores_numpy = loop.LocalOutlierProbability(\n    data,\n    n_neighbors=3,\n    use_numba=False,\n    progress_bar=True\n).fit().local_outlier_probabilities\nt2 = time.time()\nseconds_no_numba = t2 - t1\nprint(\"\\nComputation took \" + str(seconds_no_numba) + \" seconds without Numba JIT.\")\n\nt3 = time.time()\nscores_numba = loop.LocalOutlierProbability(\n    data,\n    n_neighbors=3,\n    use_numba=True,\n    progress_bar=True\n).fit().local_outlier_probabilities\nt4 = time.time()\nseconds_numba = t4 - t3\nprint(\"\\nComputation took \" + str(seconds_numba) + \" seconds with Numba JIT.\")\n"
  },
  {
    "path": "examples/numpy_array.py",
    "content": ""
  },
  {
    "path": "examples/stream.py",
    "content": "import numpy as np\nfrom PyNomaly import loop\nimport pandas as pd\nfrom pydataset import data\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.mplot3d import Axes3D\n\n\niris = pd.DataFrame(data('iris'))\niris = pd.DataFrame(iris.drop('Species', 1))\n\niris_train = iris.iloc[:, 0:4].head(120)\niris_test = iris.iloc[:, 0:4].tail(30)\n\nm = loop.LocalOutlierProbability(iris).fit()\nscores_noclust = m.local_outlier_probabilities\niris['scores'] = scores_noclust\n\nm_train = loop.LocalOutlierProbability(iris_train, n_neighbors=10)\nm_train.fit()\niris_train_scores = m_train.local_outlier_probabilities\n\niris_test_scores = []\nfor index, row in iris_test.iterrows():\n    array = np.array([row['Sepal.Length'], row['Sepal.Width'], row['Petal.Length'], row['Petal.Width']])\n    iris_test_scores.append(m_train.stream(array))\niris_test_scores = np.array(iris_test_scores)\n\niris['stream_scores'] = np.hstack((iris_train_scores, iris_test_scores))\n# iris['scores'] from earlier example\nrmse = np.sqrt(((iris['scores'] - iris['stream_scores']) ** 2).mean(axis=None))\nprint(rmse)\n\nfig = plt.figure(figsize=(7, 7))\nax = fig.add_subplot(111, projection='3d')\nax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],\nc=iris['stream_scores'], cmap='seismic', s=50)\nax.set_xlabel('Sepal.Width')\nax.set_ylabel('Petal.Width')\nax.set_zlabel('Sepal.Length')\nplt.show()\nplt.clf()\nplt.cla()\nplt.close()\n"
  },
  {
    "path": "paper/codemeta.json",
    "content": "{\n  \"@context\": \"https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld\",\n  \"@type\": \"Code\",\n  \"author\": [\n    {\n      \"@id\": \"http://orcid.org/0000-0002-5279-4143\",\n      \"@type\": \"Person\",\n      \"email\": \"vconstan@jpl.caltech.edu\",\n      \"name\": \"Valentino Constantinou\",\n      \"affiliation\": \"NASA Jet Propulsion Laboratory\"\n    }\n  ],\n  \"identifier\": \"\",\n  \"codeRepository\": \"https://www.github.com/vc1492a/PyNomaly\",\n  \"datePublished\": \"2018-05-07\",\n  \"dateModified\": \"2018-05-07\",\n  \"dateCreated\": \"2018-05-07\",\n  \"description\": \"Anomaly detection using Local Outlier Probabilities (LoOP).\",\n  \"keywords\": \"machine learning, unsupervised learning, outlier detection, anomaly detection, nearest neighbors, statistics, probability\",\n  \"license\": \"Apache 2.0\",\n  \"title\": \"PyNomaly\",\n  \"version\": \"v0.2.0\"\n}"
  },
  {
    "path": "paper/paper.bib",
    "content": "@inproceedings{Breunig,\n author = {Breunig, Markus M. and Kriegel, Hans-Peter and Ng, Raymond T. and Sander, J\\\"{o}rg},\n title = {LOF: Identifying Density-based Local Outliers},\n booktitle = {Proceedings of the 2000 ACM SIGMOD International Conference on Management of Data},\n series = {SIGMOD '00},\n year = {2000},\n isbn = {1-58113-217-4},\n location = {Dallas, Texas, USA},\n pages = {93--104},\n numpages = {12},\n url = {http://doi.acm.org/10.1145/342009.335388},\n doi = {10.1145/342009.335388},\n acmid = {335388},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {database mining, outlier detection},\n}\n\n@inproceedings{Kriegel,\n author = {Kriegel, Hans-Peter and Kr\\\"{o}ger, Peer and Schubert, Erich and Zimek, Arthur},\n title = {LoOP: Local Outlier Probabilities},\n booktitle = {Proceedings of the 18th ACM Conference on Information and Knowledge Management},\n series = {CIKM '09},\n year = {2009},\n isbn = {978-1-60558-512-3},\n location = {Hong Kong, China},\n pages = {1649--1652},\n numpages = {4},\n url = {http://doi.acm.org/10.1145/1645953.1646195},\n doi = {10.1145/1645953.1646195},\n acmid = {1646195},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {outlier detection},\n}\n\n@article{Hamlet,\n doi= {10.1080/23742917.2016.1226651},\n author = {Connor Hamlet and Jeremy Straub and Matthew Russell and Scott Kerlin},\n title = {An incremental and approximate local outlier probability algorithm for intrusion detection and its evaluation},\n journal = {Journal of Cyber Security Technology},\n volume = {1},\n number = {2},\n pages = {75-87},\n year  = {2017},\n publisher = {Taylor & Francis},\n doi = {10.1080/23742917.2016.1226651},\n URL = {https://doi.org/10.1080/23742917.2016.1226651},\n eprint = {https://doi.org/10.1080/23742917.2016.1226651}\n}"
  },
  {
    "path": "paper/paper.md",
    "content": "---\ntitle: 'PyNomaly: Anomaly detection using Local Outlier Probabilities (LoOP).'\ntags:\n  - outlier detection\n  - anomaly detection\n  - probability\n  - nearest neighbors\n  - unsupervised learning\n  - machine learning\n  - statistics\nauthors:\n - name: Valentino Constantinou\n   orcid: 0000-0002-5279-4143\n   affiliation: 1\naffiliations:\n - name: NASA Jet Propulsion Laboratory\n   index: 1\ndate: 7 May 2018\nbibliography: paper.bib\n---\n\n# Summary\n\n``PyNomaly`` is a Python 3 implementation of LoOP (Local Outlier\nProbabilities) [@Kriegel]. LoOP is a local density based outlier detection\nmethod by Kriegel, Kröger, Schubert, and Zimek which provides\noutlier scores in the range of [0,1] that are directly\ninterpretable as the probability of a sample being an outlier.\n``PyNomaly`` also implements a modified approach to LoOP [@Hamlet], which may be used for applications involving\nstreaming data or where rapid calculations may be necessary.\n\nThe outlier score of each sample is called the Local Outlier\nProbability. It measures the local deviation of density of a\ngiven sample with respect to its neighbors as Local Outlier\nFactor (LOF) [@Breunig], but provides normalized outlier scores in the\nrange [0,1]. These outlier scores are directly interpretable\nas a probability of an object being an outlier. Since Local\nOutlier Probabilities provides scores in the range [0,1],\npractitioners are free to interpret the results according to\nthe application.\n\nLike LOF, it is local in that the anomaly score depends on\nhow isolated the sample is with respect to the surrounding\nneighborhood. 
Locality is given by k-nearest neighbors,\nwhose distance is used to estimate the local density.\nBy comparing the local density of a sample to the local\ndensities of its neighbors, one can identify samples that\nlie in regions of lower density compared to their neighbors\nand thus identify samples that may be outliers according to\ntheir Local Outlier Probability.\n\n``PyNomaly`` includes an optional _cluster_labels_ parameter.\nThis is useful in cases where regions of varying density\noccur within the same set of data. When using _cluster_labels_,\nthe Local Outlier Probability of a sample is calculated with\nrespect to its cluster assignment.\n\n## Research\n\nPyNomaly is currently being used in the following research:\n\n- Y. Zhao and M.K. Hryniewicki, \"XGBOD: Improving Supervised\nOutlier Detection with Unsupervised Representation Learning,\"\nInternational Joint Conference on Neural Networks (IJCNN),\nIEEE, 2018.\n\n## Acknowledgements\n\nThe authors recognize the support of Kyle Hundman and Ian Colwell.\n\n# References"
  },
  {
    "path": "requirements.txt",
    "content": "numpy>=1.12.0\npython-utils>=2.3.0"
  },
  {
    "path": "requirements_ci.txt",
    "content": "coveralls>=1.8.0\npandas>=0.24.2\npytest>=4.6.2\npytest-cov>=2.7.1\nscikit-learn>=0.21.2\nscipy>=1.3.0\nwheel>=0.33.4"
  },
  {
    "path": "requirements_examples.txt",
    "content": "matplotlib==3.1.0\npandas>=0.24.2\npydataset>=0.2.0\nscikit-learn>=0.21.2\nscipy>=1.3.0"
  },
  {
    "path": "setup.py",
    "content": "from setuptools import setup\n\nfrom pathlib import Path\nthis_directory = Path(__file__).parent\nlong_description = (this_directory / \"README.md\").read_text()\n\nsetup(\n    name='PyNomaly',\n    packages=['PyNomaly'],\n    version='0.3.5',\n    description='A Python 3 implementation of LoOP: Local Outlier '\n                'Probabilities, a local density based outlier detection '\n                'method providing an outlier score in the range of [0,1].',\n    author='Valentino Constantinou',\n    author_email='vc@valentino.io',\n    long_description=long_description,\n    long_description_content_type='text/markdown',\n    url='https://github.com/vc1492a/PyNomaly',\n    download_url='https://github.com/vc1492a/PyNomaly/archive/0.3.5.tar.gz',\n    keywords=['outlier', 'anomaly', 'detection', 'machine', 'learning',\n              'probability'],\n    classifiers=[],\n    license='Apache License, Version 2.0',\n    install_requires=['numpy', 'python-utils']\n)\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/test_loop.py",
    "content": "# Authors: Valentino Constantinou <vc@valentino.io>\n# License: Apache 2.0\n\nfrom PyNomaly import loop\nfrom PyNomaly.loop import ClusterSizeError, MissingValuesError\n\nimport logging\nfrom typing import Tuple\nimport numpy as np\nfrom numpy.testing import assert_array_equal, assert_array_almost_equal\nimport pandas as pd\nimport pytest\nfrom sklearn.datasets import load_iris\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.neighbors import NearestNeighbors\nfrom sklearn.utils import check_random_state\nimport sys\n\nlogging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n\n# flag to enable or disable NUMBA\nNUMBA = False\n\nif NUMBA is False:\n    logging.info(\n        \"Numba is disabled. Coverage statistics are reflective of \"\n        \"testing native Python code. Consider also testing with numba\"\n        \" enabled.\"\n    )\nelse:\n    logging.warning(\n        \"Numba is enabled. Coverage statistics will be impacted (reduced) to\"\n        \" due the just-in-time compilation of native Python code.\"\n    )\n\n# load the iris dataset\n# and randomly permute it\nrng = check_random_state(0)\niris = load_iris()\nperm = rng.permutation(iris.target.size)\niris.data = iris.data[perm]\niris.target = iris.target[perm]\n\n\n# fixtures\n@pytest.fixture()\ndef X_n8() -> np.ndarray:\n    \"\"\"\n    Fixture that generates a small Numpy array with two anomalous values\n    (last two observations).\n    :return: a Numpy array.\n    \"\"\"\n    # Toy sample (the last two samples are outliers):\n    X = np.array(\n        [[-2, -1], [-1, -1], [-1, -2], [1, 2], [1, 2], [2, 1], [5, 3], [-4, 2]]\n    )\n    return X\n\n\n@pytest.fixture()\ndef X_n20_scores() -> Tuple[np.ndarray, np.ndarray]:\n    \"\"\"\n    Fixture that returns a tuple containing a 20 element numpy array\n    and the precalculated loOP scores based on that array.\n    :return: tuple(input_data,exptected_scores)\n    \"\"\"\n    input_data = np.array(\n        [\n            
0.02059752,\n            0.32629926,\n            0.63036653,\n            0.94409321,\n            0.63251097,\n            0.47598494,\n            0.80204026,\n            0.34845067,\n            0.81556468,\n            0.89183,\n            0.25210317,\n            0.11460502,\n            0.19953434,\n            0.36955067,\n            0.06038041,\n            0.34527368,\n            0.56621582,\n            0.90533649,\n            0.33773613,\n            0.71573306,\n        ]\n    )\n\n    expected_scores = np.array(\n        [\n            0.6356276742921594,\n            0.0,\n            0.0,\n            0.48490790006974044,\n            0.0,\n            0.0,\n            0.0,\n            0.0,\n            0.021728288376168012,\n            0.28285086151683225,\n            0.0,\n            0.18881886507113213,\n            0.0,\n            0.0,\n            0.45350246469681843,\n            0.0,\n            0.07886635748113013,\n            0.3349068501560546,\n            0.0,\n            0.0,\n        ]\n    )\n    return (input_data, expected_scores)\n\n\n@pytest.fixture()\ndef X_n120() -> np.ndarray:\n    \"\"\"\n    Fixture that generates a Numpy array with 120 observations. 
Each\n    observation contains two float values.\n    :return: a Numpy array.\n    \"\"\"\n    # Generate train/test data\n    rng = check_random_state(2)\n    X = 0.3 * rng.randn(120, 2)\n    return X\n\n\n@pytest.fixture()\ndef X_n140_outliers(X_n120) -> np.ndarray:\n    \"\"\"\n    Fixture that generates a Numpy array with 140 observations, where the\n    first 120 observations are \"normal\" and the last 20 considered anomalous.\n    :param X_n120: A pytest Fixture that generates the first 120 observations.\n    :return: A Numpy array.\n    \"\"\"\n    # Generate some abnormal novel observations\n    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))\n    X = np.r_[X_n120, X_outliers]\n    return X\n\n\n@pytest.fixture()\ndef X_n1000() -> np.ndarray:\n    \"\"\"\n    Fixture that generates a Numpy array with 1000 observations.\n    :return: A Numpy array.\n    \"\"\"\n    # Generate train/test data\n    rng = check_random_state(2)\n    X = 0.3 * rng.randn(1000, 2)\n    return X\n\n\ndef test_loop(X_n8) -> None:\n    \"\"\"\n    Tests the basic functionality and asserts that the anomalous observations\n    are detected as anomalies. 
Tests the functionality using inputs\n    as Numpy arrays and as Pandas dataframes.\n    :param X_n8: A pytest Fixture that generates the 8 observations.\n    :return: None\n    \"\"\"\n    # Test LocalOutlierProbability:\n    clf = loop.LocalOutlierProbability(X_n8, n_neighbors=5, use_numba=NUMBA)\n    score = clf.fit().local_outlier_probabilities\n    share_outlier = 2.0 / 8.0\n    predictions = [-1 if s > share_outlier else 1 for s in score]\n    assert_array_equal(predictions, 6 * [1] + 2 * [-1])\n\n    # Assert smallest outlier score is greater than largest inlier score:\n    assert np.min(score[-2:]) > np.max(score[:-2])\n\n    # Test the DataFrame functionality\n    X_df = pd.DataFrame(X_n8)\n\n    # Test LocalOutlierProbability:\n    clf = loop.LocalOutlierProbability(X_df, n_neighbors=5, use_numba=NUMBA)\n    score = clf.fit().local_outlier_probabilities\n    share_outlier = 2.0 / 8.0\n    predictions = [-1 if s > share_outlier else 1 for s in score]\n    assert_array_equal(predictions, 6 * [1] + 2 * [-1])\n\n    # Assert smallest outlier score is greater than largest inlier score:\n    assert np.min(score[-2:]) > np.max(score[:-2])\n\n\ndef test_regression(X_n20_scores) -> None:\n    \"\"\"\n    Tests for potential regression errors by comparing current results\n    to the exptected results. 
Any changes to the code should still return\n    the same result given the same dataset\n    \"\"\"\n    input_data, expected_scores = X_n20_scores\n    clf = loop.LocalOutlierProbability(input_data).fit()\n    scores = clf.local_outlier_probabilities\n    assert_array_almost_equal(scores, expected_scores, 6)\n\n\ndef test_loop_performance(X_n120) -> None:\n    \"\"\"\n    Using a set of known anomalies (labels), tests the performance (using\n    ROC / AUC score) of the software and ensures it is able to capture most\n    anomalies under this basic scenario.\n    :param X_n120: A pytest Fixture that generates the 120 observations.\n    :return: None\n    \"\"\"\n    # Generate some abnormal novel observations\n    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))\n    X_test = np.r_[X_n120, X_outliers]\n    X_labels = np.r_[np.repeat(1, X_n120.shape[0]), np.repeat(-1, X_outliers.shape[0])]\n\n    # fit the model\n    clf = loop.LocalOutlierProbability(\n        X_test,\n        n_neighbors=X_test.shape[0] - 1,\n        # test the progress bar\n        progress_bar=True,\n        use_numba=NUMBA,\n    )\n\n    # predict scores (the lower, the more normal)\n    score = clf.fit().local_outlier_probabilities\n    share_outlier = X_outliers.shape[0] / X_test.shape[0]\n    X_pred = [-1 if s > share_outlier else 1 for s in score]\n\n    # check that roc_auc is good\n    assert roc_auc_score(X_pred, X_labels) >= 0.98\n\n\ndef test_input_nodata(X_n140_outliers) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if no data is\n    provided.\n    :param X_n140_outliers: A pytest Fixture that generates 140 observations.\n    :return: None\n    \"\"\"\n    with pytest.warns(UserWarning) as record:\n        # attempt to fit loop without data or a distance matrix\n        loop.LocalOutlierProbability(\n            n_neighbors=X_n140_outliers.shape[0] - 1, use_numba=NUMBA\n        )\n\n    # check that only one warning was raised\n    assert 
len(record) == 1\n    # check that the message matches\n    assert record[0].message.args[0] == \"Data or a distance matrix must be provided.\"\n\n\ndef test_input_incorrect_type(X_n140_outliers) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if the type of an\n    argument is the incorrect type.\n    :param X_n140_outliers: A pytest Fixture that generates 140 observations.\n    :return: None\n    \"\"\"\n    with pytest.warns(UserWarning) as record:\n        # attempt to fit loop with a string input for n_neighbors\n        loop.LocalOutlierProbability(\n            X_n140_outliers,\n            n_neighbors=str(X_n140_outliers.shape[0] - 1),\n            use_numba=NUMBA,\n        )\n\n    # check that only one warning was raised\n    assert len(record) == 1\n    # check that the message matches\n    assert (\n        record[0].message.args[0]\n        == \"Argument 'n_neighbors' is not of type (<class 'int'>, \"\n        \"<class 'numpy.integer'>).\"\n    )\n\n\ndef test_input_neighbor_zero(X_n120) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if the neighbor size\n    is specified as 0 (must be greater than 0).\n    :param X_n120: A pytest Fixture that generates 120 observations.\n    :return: None\n    \"\"\"\n    clf = loop.LocalOutlierProbability(X_n120, n_neighbors=0, use_numba=NUMBA)\n\n    with pytest.warns(UserWarning) as record:\n        # attempt to fit loop with a 0 neighbor count\n        clf.fit()\n\n    # check that only one warning was raised\n    assert len(record) == 1\n    # check that the message matches\n    assert (\n        record[0].message.args[0]\n        == \"n_neighbors must be greater than 0. 
Fit with 10 instead.\"\n    )\n\n\ndef test_input_distonly(X_n120) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if only a distance\n    matrix is provided (without a neighbor matrix).\n    :param X_n120: A pytest Fixture that generates 120 observations.\n    :return: None\n    \"\"\"\n    # generate distance and neighbor indices\n    neigh = NearestNeighbors(metric=\"euclidean\")\n    neigh.fit(X_n120)\n    d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)\n\n    with pytest.warns(UserWarning) as record:\n        # attempt to fit loop with a distance matrix and no neighbor matrix\n        loop.LocalOutlierProbability(distance_matrix=d, use_numba=NUMBA)\n\n    # check that only one warning was raised\n    assert len(record) == 1\n    # check that the message matches\n    assert (\n        record[0].message.args[0]\n        == \"A neighbor index matrix and distance matrix must both \"\n        \"be provided when not using raw input data.\"\n    )\n\n\ndef test_input_neighboronly(X_n120) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if only a neighbor\n    matrix is provided (without a distance matrix).\n    :param X_n120: A pytest Fixture that generates 120 observations.\n    :return: None\n    \"\"\"\n    # generate distance and neighbor indices\n    neigh = NearestNeighbors(metric=\"euclidean\")\n    neigh.fit(X_n120)\n    d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)\n\n    with pytest.warns(UserWarning) as record:\n        # attempt to fit loop with a neighbor matrix and no distance matrix\n        loop.LocalOutlierProbability(neighbor_matrix=idx, use_numba=NUMBA)\n\n    # check that only one warning was raised\n    assert len(record) == 1\n    # check that the message matches\n    assert record[0].message.args[0] == \"Data or a distance matrix must be provided.\"\n\n\ndef test_input_too_many(X_n120) -> None:\n    \"\"\"\n    Test to ensure that the proper 
warning is issued if both a data matrix\n    and a distance matrix are provided (can only be data matrix).\n    :param X_n120: A pytest Fixture that generates 120 observations.\n    :return: None\n    \"\"\"\n    # generate distance and neighbor indices\n    neigh = NearestNeighbors(metric=\"euclidean\")\n    neigh.fit(X_n120)\n    d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)\n\n    with pytest.warns(UserWarning) as record:\n        # attempt to fit loop with data and a distance matrix\n        loop.LocalOutlierProbability(\n            X_n120, distance_matrix=d, neighbor_matrix=idx, use_numba=NUMBA\n        )\n\n    # check that only one warning was raised\n    assert len(record) == 1\n    # check that the message matches\n    assert (\n        record[0].message.args[0]\n        == \"Only one of the following may be provided: data or a \"\n        \"distance matrix (not both).\"\n    )\n\n\ndef test_distance_neighbor_shape_mismatch(X_n120) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if there is a mismatch\n    between the shape of the provided distance and neighbor matrices.\n    :param X_n120: A pytest Fixture that generates 120 observations.\n    :return: None\n    \"\"\"\n    # generate distance and neighbor indices\n    neigh = NearestNeighbors(metric=\"euclidean\")\n    neigh.fit(X_n120)\n    d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)\n\n    # generate distance and neighbor indices of a different shape\n    neigh_2 = NearestNeighbors(metric=\"euclidean\")\n    neigh_2.fit(X_n120)\n    d_2, idx_2 = neigh.kneighbors(X_n120, n_neighbors=5, return_distance=True)\n\n    with pytest.warns(UserWarning) as record:\n        # attempt to fit loop with a mismatch in shapes\n        loop.LocalOutlierProbability(\n            distance_matrix=d, neighbor_matrix=idx_2, n_neighbors=5, use_numba=NUMBA\n        )\n\n    # check that only one warning was raised\n    assert len(record) == 1\n  
  # check that the message matches\n    assert (\n        record[0].message.args[0] == \"The shape of the distance and neighbor \"\n        \"index matrices must match.\"\n    )\n\n\ndef test_input_neighbor_mismatch(X_n120) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if the supplied distance\n    (and neighbor) matrix and specified number of neighbors do not match.\n    :param X_n120: A pytest Fixture that generates 120 observations.\n    :return: None\n    \"\"\"\n    # generate distance and neighbor indices\n    neigh = NearestNeighbors(metric=\"euclidean\")\n    neigh.fit(X_n120)\n    d, idx = neigh.kneighbors(X_n120, n_neighbors=5, return_distance=True)\n\n    with pytest.warns(UserWarning) as record:\n        # attempt to fit loop with a neighbor size mismatch\n        loop.LocalOutlierProbability(\n            distance_matrix=d, neighbor_matrix=idx, n_neighbors=10, use_numba=NUMBA\n        )\n\n    # check that only one warning was raised\n    assert len(record) == 1\n    # check that the message matches\n    assert (\n        record[0].message.args[0] == \"The shape of the distance or \"\n        \"neighbor index matrix does not \"\n        \"match the number of neighbors \"\n        \"specified.\"\n    )\n\n\ndef test_loop_dist_matrix(X_n120) -> None:\n    \"\"\"\n    Tests to ensure the proper results are returned when supplying the\n    appropriate format distance and neighbor matrices.\n    :param X_n120: A pytest Fixture that generates 120 observations.\n    :return: None\n    \"\"\"\n    # generate distance and neighbor indices\n    neigh = NearestNeighbors(metric=\"euclidean\")\n    neigh.fit(X_n120)\n    d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)\n\n    # fit loop using data and distance matrix\n    clf1 = loop.LocalOutlierProbability(X_n120, use_numba=NUMBA)\n    clf2 = loop.LocalOutlierProbability(\n        distance_matrix=d, neighbor_matrix=idx, use_numba=NUMBA\n    )\n    scores1 = 
clf1.fit().local_outlier_probabilities\n    scores2 = clf2.fit().local_outlier_probabilities\n\n    # compare the agreement between the results\n    assert np.abs(scores2 - scores1).all() <= 0.1\n\n\ndef test_lambda_values(X_n140_outliers) -> None:\n    \"\"\"\n    Test to ensure results are returned which correspond to what is expected\n    when varying the extent parameter (we expect larger extent values to\n    result in more constrained scores).\n    :param X_n140_outliers: A pytest Fixture that generates 140 observations.\n    :return: None\n    \"\"\"\n    # Fit the model with different extent (lambda) values\n    clf1 = loop.LocalOutlierProbability(X_n140_outliers, extent=1, use_numba=NUMBA)\n    clf2 = loop.LocalOutlierProbability(X_n140_outliers, extent=2, use_numba=NUMBA)\n    clf3 = loop.LocalOutlierProbability(X_n140_outliers, extent=3, use_numba=NUMBA)\n\n    # predict scores (the lower, the more normal)\n    score1 = clf1.fit().local_outlier_probabilities\n    score2 = clf2.fit().local_outlier_probabilities\n    score3 = clf3.fit().local_outlier_probabilities\n\n    # Get the mean of all the scores\n    score_mean1 = np.mean(score1)\n    score_mean2 = np.mean(score2)\n    score_mean3 = np.mean(score3)\n\n    # check that expected the means align with expectation\n    assert score_mean1 > score_mean2\n    assert score_mean2 > score_mean3\n\n\ndef test_parameters(X_n120) -> None:\n    \"\"\"\n    Test to ensure that the model object contains the needed attributes after\n    the model is fit. 
This is important in the context of the streaming\n    functionality.\n    :param X_n120: A pytest Fixture that generates 120 observations.\n    :return: None\n    \"\"\"\n    # fit the model\n    clf = loop.LocalOutlierProbability(X_n120, use_numba=NUMBA).fit()\n\n    # check that the model has attributes post fit\n    assert hasattr(clf, \"n_neighbors\") and clf.n_neighbors is not None\n    assert hasattr(clf, \"extent\") and clf.extent is not None\n    assert hasattr(clf, \"cluster_labels\") and clf._cluster_labels() is not None\n    assert hasattr(clf, \"prob_distances\") and clf.prob_distances is not None\n    assert hasattr(clf, \"prob_distances_ev\") and clf.prob_distances_ev is not None\n    assert (\n        hasattr(clf, \"norm_prob_local_outlier_factor\")\n        and clf.norm_prob_local_outlier_factor is not None\n    )\n    assert (\n        hasattr(clf, \"local_outlier_probabilities\")\n        and clf.local_outlier_probabilities is not None\n    )\n\n\ndef test_n_neighbors() -> None:\n    \"\"\"\n    Tests the functionality of providing a large number of neighbors that\n    is greater than the number of observations (software defaults to the\n    data input size and provides a UserWarning).\n    :return: None\n    \"\"\"\n    X = iris.data\n    clf = loop.LocalOutlierProbability(X, n_neighbors=500, use_numba=NUMBA).fit()\n    assert clf.n_neighbors == X.shape[0] - 1\n\n    clf = loop.LocalOutlierProbability(X, n_neighbors=500, use_numba=NUMBA)\n\n    with pytest.warns(UserWarning) as record:\n        clf.fit()\n\n    # check that only one warning was raised\n    assert len(record) == 1\n\n    assert clf.n_neighbors == X.shape[0] - 1\n\n\ndef test_extent() -> None:\n    \"\"\"\n    Test to ensure that a UserWarning is issued when providing an invalid\n    extent parameter value (can be 1, 2, or 3).\n    :return: None\n    \"\"\"\n    X = np.array([[1, 1], [1, 0]])\n    clf = loop.LocalOutlierProbability(X, n_neighbors=2, extent=4, use_numba=NUMBA)\n\n  
  with pytest.warns(UserWarning) as record:\n        clf.fit()\n\n    # check that only one warning was raised\n    assert len(record) == 1\n\n\ndef test_data_format() -> None:\n    \"\"\"\n    Test to ensure that a UserWarning is issued when the shape of the input\n    data is not explicitly correct. This is corrected by the software when\n    possible.\n    :return: None\n    \"\"\"\n    X = [1.3, 1.1, 0.9, 1.4, 1.5, 3.2]\n    clf = loop.LocalOutlierProbability(X, n_neighbors=3, use_numba=NUMBA)\n\n    with pytest.warns(UserWarning) as record:\n        clf.fit()\n\n    # check that only one warning was raised\n    assert len(record) == 1\n\n\ndef test_missing_values() -> None:\n    \"\"\"\n    Test to ensure that MissingValuesError is raised if a missing value is\n    encountered in the input data, as this is not allowable.\n    :return: None\n    \"\"\"\n    X = np.array([1.3, 1.1, 0.9, 1.4, 1.5, np.nan, 3.2])\n    clf = loop.LocalOutlierProbability(X, n_neighbors=3, use_numba=NUMBA)\n\n    with pytest.raises(MissingValuesError) as record:\n        clf.fit()\n\n    # check that the message matches\n    assert (\n        str(record.value)\n        == \"Method does not support missing values in input data.\"\n    )\n\n\ndef test_small_cluster_size(X_n140_outliers) -> None:\n    \"\"\"\n    Test to ensure that ClusterSizeError is raised when the specified number of\n    neighbors is larger than the smallest cluster size in the input data.\n    :param X_n140_outliers: A pytest Fixture that generates 140 observations.\n    :return: None\n    \"\"\"\n    # Generate cluster labels\n    a = [0] * 120\n    b = [1] * 18\n    cluster_labels = a + b\n\n    clf = loop.LocalOutlierProbability(\n        X_n140_outliers, n_neighbors=50, cluster_labels=cluster_labels, use_numba=NUMBA\n    )\n\n    with pytest.raises(ClusterSizeError) as record:\n        clf.fit()\n\n    # check that the message matches\n    assert (\n        str(record.value)\n        == \"Number of neighbors 
specified larger than smallest \"\n        \"cluster. Specify a number of neighbors smaller than \"\n        \"the smallest cluster size (observations in smallest \"\n        \"cluster minus one).\"\n    )\n\n\ndef test_stream_fit(X_n140_outliers) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if the user attempts\n    to use the streaming approach prior to the classical approach being fit.\n    :param X_n140_outliers: A pytest Fixture that generates 140 observations.\n    :return: None\n    \"\"\"\n    # Fit the model\n    X_train = X_n140_outliers[0:138]\n    X_test = X_n140_outliers[139]\n    clf = loop.LocalOutlierProbability(X_train, use_numba=NUMBA)\n\n    with pytest.warns(UserWarning) as record:\n        clf.stream(X_test)\n\n    # check that the message matches\n    messages = [i.message.args[0] for i in record]\n    assert (\n        \"Must fit on historical data by calling fit() prior to \"\n        \"calling stream(x).\" in messages\n    )\n\n\ndef test_stream_distance(X_n140_outliers) -> None:\n    \"\"\"\n    Test to ensure that the streaming approach functions as desired when\n    providing matrices for use and that the returned results are within some\n    margin of error when compared to the classical approach (using the RMSE).\n    :param X_n140_outliers: A pytest Fixture that generates 140 observations.\n    :return: None\n    \"\"\"\n    X_train = X_n140_outliers[0:100]\n    X_test = X_n140_outliers[100:140]\n\n    # generate distance and neighbor indices\n    neigh = NearestNeighbors(metric=\"euclidean\")\n    neigh.fit(X_train)\n    d, idx = neigh.kneighbors(X_train, n_neighbors=10, return_distance=True)\n\n    # Fit the models in standard and distance matrix form\n    m = loop.LocalOutlierProbability(X_train, use_numba=NUMBA).fit()\n    m_dist = loop.LocalOutlierProbability(\n        distance_matrix=d, neighbor_matrix=idx, use_numba=NUMBA\n    ).fit()\n\n    # Collect the scores\n    X_test_scores = []\n    for i 
in range(X_test.shape[0]):\n        X_test_scores.append(m.stream(np.array(X_test[i])))\n    X_test_scores = np.array(X_test_scores)\n\n    X_test_dist_scores = []\n    for i in range(X_test.shape[0]):\n        dd, ii = neigh.kneighbors(np.array([X_test[i]]), return_distance=True)\n        X_test_dist_scores.append(m_dist.stream(np.mean(dd)))\n    X_test_dist_scores = np.array(X_test_dist_scores)\n\n    # calculate the rmse and ensure score is below threshold\n    rmse = np.sqrt(((X_test_scores - X_test_dist_scores) ** 2).mean(axis=None))\n    assert 0.075 >= rmse\n\n\ndef test_stream_cluster(X_n140_outliers) -> None:\n    \"\"\"\n    Test to ensure that the proper warning is issued if the streaming approach\n    is called on clustered data, as the streaming approach does not support\n    this functionality.\n    :param X_n140_outliers: A pytest Fixture that generates 140 observations.\n    :return: None\n    \"\"\"\n    # Generate cluster labels\n    a = [0] * 120\n    b = [1] * 18\n    cluster_labels = a + b\n\n    # Fit the model\n    X_train = X_n140_outliers[0:138]\n    X_test = X_n140_outliers[139]\n    clf = loop.LocalOutlierProbability(\n        X_train, cluster_labels=cluster_labels, use_numba=NUMBA\n    ).fit()\n\n    with pytest.warns(UserWarning) as record:\n        clf.stream(X_test)\n\n    # check that only one warning was raised\n    assert len(record) == 1\n    # check that the message matches\n    assert (\n        record[0].message.args[0] == \"Stream approach does not support clustered data. 
\"\n        \"Automatically refit using single cluster of points.\"\n    )\n\n\ndef test_stream_performance(X_n140_outliers) -> None:\n    \"\"\"\n    Test to ensure that the streaming approach works as desired when using\n    a regular set of input data (no distance and neighbor matrices) and that\n    the result is within some expected level of error when compared to the\n    classical approach.\n    :param X_n140_outliers: A pytest Fixture that generates 140 observations.\n    :return:\n    \"\"\"\n    X_train = X_n140_outliers[0:100]\n    X_test = X_n140_outliers[100:140]\n\n    # Fit the models in standard and stream form\n    m = loop.LocalOutlierProbability(X_n140_outliers, use_numba=NUMBA).fit()\n    scores_noclust = m.local_outlier_probabilities\n\n    m_train = loop.LocalOutlierProbability(X_train, use_numba=NUMBA)\n    m_train.fit()\n    X_train_scores = m_train.local_outlier_probabilities\n\n    X_test_scores = []\n    for idx in range(X_test.shape[0]):\n        X_test_scores.append(m_train.stream(X_test[idx]))\n    X_test_scores = np.array(X_test_scores)\n\n    stream_scores = np.hstack((X_train_scores, X_test_scores))\n\n    # calculate the rmse and ensure score is below threshold\n    rmse = np.sqrt(((scores_noclust - stream_scores) ** 2).mean(axis=None))\n    assert 0.35 > rmse\n\n\ndef test_progress_bar(X_n8) -> None:\n    \"\"\"\n    Tests the progress bar functionality on a small number of observations,\n    when the number of observations is less than the width of the console\n    window.\n    :param X_n8: a numpy array with 8 observations.\n    :return: None\n    \"\"\"\n\n    # attempt to use the progress bar on a small number of observations\n    loop.LocalOutlierProbability(X_n8, use_numba=NUMBA, progress_bar=True).fit()\n\n\ndef test_data_flipping() -> None:\n    \"\"\"\n    Tests the flipping of data and cluster labels and ensures that the\n    :return: None\n    \"\"\"\n    np.random.seed(1)\n    n = 9\n    data = np.append(\n        
np.random.normal(2, 1, [n, 2]), np.random.normal(8, 1, [n, 2]), axis=0\n    )\n    clus = np.append(np.ones(n), 2 * np.ones(n)).tolist()\n    model = loop.LocalOutlierProbability(data, n_neighbors=5, cluster_labels=clus)\n    fit = model.fit()\n    res = fit.local_outlier_probabilities\n\n    data_flipped = np.flipud(data)\n    clus_flipped = np.flipud(clus).tolist()\n    model2 = loop.LocalOutlierProbability(\n        data_flipped, n_neighbors=5, cluster_labels=clus_flipped\n    )\n    fit2 = model2.fit()\n    res2 = np.flipud(fit2.local_outlier_probabilities)\n\n    assert_array_almost_equal(res, res2, decimal=6)\n    assert_array_almost_equal(\n        fit.norm_prob_local_outlier_factor,\n        fit2.norm_prob_local_outlier_factor,\n        decimal=6,\n    )\n\n\ndef test_distance_matrix_consistency(X_n120) -> None:\n    \"\"\"\n    Test to ensure that the distance matrix is consistent with the neighbor\n    matrix and that the software is able to handle self-distances.\n    :return: None\n    \"\"\"\n\n    neigh = NearestNeighbors(metric='euclidean')\n    neigh.fit(X_n120)\n    distances, indices = neigh.kneighbors(X_n120, n_neighbors=11, return_distance=True)\n\n    # remove the closest neighbor (its the point itself) from each row in the indices matrix and distances matrix\n    indices = np.delete(indices, 0, 1)\n    distances = np.delete(distances, 0, 1)\n\n    # Fit LoOP with and without distance matrix\n    clf_data = loop.LocalOutlierProbability(X_n120, n_neighbors=10)\n    clf_dist = loop.LocalOutlierProbability(distance_matrix=distances, neighbor_matrix=indices, n_neighbors=11)\n\n    # Attempt to retrieve scores and check types\n    scores_data = clf_data.fit().local_outlier_probabilities\n    scores_dist = clf_dist.fit().local_outlier_probabilities\n\n    # Debugging prints to investigate types and contents\n    print(\"Type of scores_data:\", type(scores_data))\n    print(\"Type of scores_dist:\", type(scores_dist))\n    print(\"Value of 
scores_data:\", scores_data)\n    print(\"Value of scores_dist:\", scores_dist)\n    print(\"Shape of scores_data:\", scores_data.shape)\n    print(\"Shape of scores_dist:\", scores_dist.shape)\n\n    # Convert to arrays if they aren't already\n    scores_data = np.array(scores_data) if not isinstance(scores_data, np.ndarray) else scores_data\n    scores_dist = np.array(scores_dist) if not isinstance(scores_dist, np.ndarray) else scores_dist\n\n    # Check shapes and types before assertion\n    assert scores_data.shape == scores_dist.shape, \"Score shapes mismatch\"\n    assert isinstance(scores_data, np.ndarray), \"Expected scores_data to be a numpy array\"\n    assert isinstance(scores_dist, np.ndarray), \"Expected scores_dist to be a numpy array\"\n\n    # Compare scores allowing for minor floating-point differences\n    assert_array_almost_equal(scores_data, scores_dist, decimal=10, err_msg=\"Inconsistent LoOP scores due to self-distances\")\n"
  }
]