Showing preview only (1,063K chars total). Download the full file or copy to clipboard to get everything.
Repository: KaveIO/PhiK
Branch: master
Commit: 4574c81f0d6b
Files: 57
Total size: 1.0 MB
Directory structure:
gitextract_19gb0anp/
├── .gitattributes
├── .github/
│ ├── dependabot.yml
│ └── workflows/
│ ├── test_matrix.yml
│ ├── tests.yml
│ ├── valgrind.yml
│ └── wheels.yml
├── .gitignore
├── .mbuild.sh
├── .readthedocs.yml
├── CHANGES.rst
├── CMakeLists.txt
├── LICENSE
├── NOTICE
├── README.rst
├── docs/
│ ├── Makefile
│ ├── README.rst
│ ├── autogenerate.sh
│ └── source/
│ ├── code.rst
│ ├── conf.py
│ ├── developing.rst
│ ├── index.rst
│ ├── introduction.rst
│ ├── phik.decorators.rst
│ ├── phik.rst
│ ├── phik_index.rst
│ ├── publication.rst
│ └── tutorials.rst
├── example.py
├── phik/
│ ├── __init__.py
│ ├── betainc.py
│ ├── binning.py
│ ├── bivariate.py
│ ├── data_quality.py
│ ├── decorators/
│ │ ├── __init__.py
│ │ └── pandas.py
│ ├── definitions.py
│ ├── entry_points.py
│ ├── notebooks/
│ │ ├── phik_tutorial_advanced.ipynb
│ │ ├── phik_tutorial_basic.ipynb
│ │ └── phik_tutorial_spark.ipynb
│ ├── outliers.py
│ ├── phik.py
│ ├── report.py
│ ├── resources.py
│ ├── significance.py
│ ├── simcore/
│ │ ├── __init__.py
│ │ ├── asa159.cpp
│ │ ├── asa159.hpp
│ │ ├── bindings.cpp
│ │ └── simulation.hpp
│ ├── simulation.py
│ ├── statistics.py
│ └── utils.py
├── pyproject.toml
└── tests/
├── integration/
│ ├── test_phik_tutorial_advanced.py
│ └── test_phik_tutorial_basic.py
└── test_phik.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
phik/notebooks/* linguist-vendored
================================================
FILE: .github/dependabot.yml
================================================
---
version: 2
updates:
  # Python (pip) dependencies: check for updates once a day
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "daily"
    allow:
      - dependency-type: "all"

  # GitHub Actions used by the workflows: check for updates once a week
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
================================================
FILE: .github/workflows/test_matrix.yml
================================================
name: Test Matrix

# Runs on manual dispatch, on every pull request, and on pushes to master.
on:
  workflow_dispatch:
  pull_request:
  push:
    branches:
      - master

jobs:
  build:
    name: ${{ matrix.platform }} Python ${{ matrix.python-version }}
    runs-on: ${{ matrix.platform }}
    strategy:
      # let the remaining platform/version combinations finish when one fails
      fail-fast: false
      matrix:
        platform:
          - windows-latest
          - macos-latest
          - ubuntu-latest
        # versions are quoted so that "3.10" is not read as the float 3.1
        python-version:
          - "3.9"
          - "3.10"
          - "3.11"
          - "3.12"
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}

      - name: Add requirements
        run: |
          python -m pip install --upgrade pip wheel

      # builds the package and installs it with its "test" extra
      - name: Build and install
        run: pip install --verbose ".[test]"

      - name: Unit test
        run: |
          cd tests
          pytest test_phik.py -v -W ignore::DeprecationWarning

      - name: Integration test
        run: |
          cd tests
          pytest integration -v -W ignore::DeprecationWarning
================================================
FILE: .github/workflows/tests.yml
================================================
name: Test

# Quick single-configuration unit-test run on every push.
on: push

jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          # Quoted on purpose: an unquoted version is parsed as a float, which
          # silently breaks for values such as 3.10 (-> 3.1).
          python-version: "3.9"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install . -v
          pip install "pytest>=4.0.2" "pytest-pylint>=0.13.0"

      - name: Test with pytest
        run: |
          cd tests
          pytest test_phik.py -W ignore::DeprecationWarning
================================================
FILE: .github/workflows/valgrind.yml
================================================
name: Valgrind

# Runs the unit tests under valgrind for pull requests against master,
# or on manual dispatch.
on:
  pull_request:
    branches:
      - master
  workflow_dispatch:

defaults:
  run:
    shell: bash

jobs:
  pre_job:
    # continue-on-error: true # Uncomment once integration is finished
    runs-on: ubuntu-latest
    # Map a step output to a job output
    outputs:
      should_skip: ${{ steps.skip_check.outputs.should_skip }}
    steps:
      - id: skip_check
        # Pinned to a release tag rather than the moving `master` branch, so
        # that upstream changes cannot alter this workflow unnoticed.
        uses: fkirc/skip-duplicate-actions@v5
        with:
          # All of these options are optional, so you can remove them if you are happy with the defaults
          cancel_others: 'true'
          do_not_skip: '["pull_request", "workflow_dispatch", "schedule"]'

  build:
    name: Valgrind
    needs: pre_job
    if: ${{ needs.pre_job.outputs.should_skip != 'true' }}
    runs-on: ubuntu-latest
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v6
        with:
          submodules: false

      - uses: actions/setup-python@v6
        with:
          python-version: '3.10'

      - name: Install dependencies on ubuntu
        run: |
          sudo apt-get update
          sudo apt-get install -y valgrind

      - name: Install python packages
        run: |
          python -m pip install --upgrade pip pytest

      # Debug build of the extension, selected through CMAKE_ARGS
      - name: Install
        run: |
          # temp fix for Valgrind issue with later versions
          pip install scipy==1.9.1
          CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" pip install . -v

      # valgrind output is written to tests/valgrind-log.txt
      - name: Test
        run: |
          cd tests
          PYTHONMALLOC=malloc valgrind --leak-check=yes --track-origins=yes --log-file=valgrind-log.txt python -m pytest test_phik.py -W ignore::DeprecationWarning
================================================
FILE: .github/workflows/wheels.yml
================================================
name: Wheels

# Builds the sdist and the platform wheels; uploads them to PyPI only when a
# GitHub release is published.
on:
  workflow_dispatch:
  pull_request:
  push:
    branches:
      - master
  release:
    types:
      - published

jobs:
  make_sdist:
    name: Make SDist
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6

      - name: Build SDist
        run: pipx run build --sdist

      - uses: actions/upload-artifact@v7
        with:
          name: artifact-sdist
          path: dist/*.tar.gz

  build_wheels:
    name: Wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - uses: pypa/cibuildwheel@v3.3.1
        env:
          CIBW_ENVIRONMENT: MACOSX_DEPLOYMENT_TARGET=10.13
          # Python 3.8 support was dropped in release 0.12.5 (see CHANGES.rst),
          # so no cp38 wheels are built any more.
          CIBW_BUILD: 'cp39-* cp310-* cp311-* cp312-* cp313-* cp314-*'
          CIBW_TEST_EXTRAS: test
          CIBW_TEST_COMMAND: pytest {project}/tests/test_phik.py -W ignore::DeprecationWarning
          CIBW_ARCHS: "auto64"
          CIBW_ARCHS_MACOS: "x86_64 arm64"
          # Skip 32-bit builds (win32, manylinux i686) and musllinux x86_64 wheels
          CIBW_SKIP: "*-win32 *-manylinux_i686 *-musllinux_x86_64"

      - name: Show files
        run: ls -lh wheelhouse
        shell: bash

      # fails the job if the build modified any tracked file
      - name: Verify clean directory
        run: git diff --exit-code
        shell: bash

      - name: Upload wheels
        uses: actions/upload-artifact@v7
        with:
          name: artifact-${{ matrix.os }}
          path: wheelhouse/*.whl

  upload_all:
    needs: [build_wheels, make_sdist]
    runs-on: ubuntu-latest
    if: github.event_name == 'release' && github.event.action == 'published'
    steps:
      # collect the sdist and all wheel artifacts into a single dist/ folder
      - uses: actions/download-artifact@v8
        with:
          pattern: artifact-*
          merge-multiple: true
          path: dist

      - uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
          password: ${{ secrets.pypi_password }}
================================================
FILE: .gitignore
================================================
*.so
*egg-info*
================================================
FILE: .mbuild.sh
================================================
#!/bin/sh
# Manual (non-pip) build of the C++ simulation extension with CMake + Ninja.
# PHIK_MBUILD=ON makes CMakeLists.txt install into the source tree.
#
# Notes:
# - SKBUILD_PROJECT_VERSION must be kept in sync with the released package
#   version (0.12.5 per README.rst / CHANGES.rst).
# - CMakeLists.txt locates Python via find_package(Python ...), whose hint
#   variable is Python_EXECUTABLE; Python3_EXECUTABLE is still passed for
#   backward compatibility.
# - Command substitutions are quoted so paths containing spaces survive.
# - The build step only runs when the configure step succeeded.
cmake -S . -G Ninja -B build \
  -DCMAKE_BUILD_TYPE=Release \
  -DSKBUILD_PROJECT_NAME="phik" \
  -DSKBUILD_PROJECT_VERSION="0.12.5" \
  -DPHIK_MBUILD=ON \
  -DPython_EXECUTABLE="$(python3 -c 'import sys; print(sys.executable)')" \
  -DPython3_EXECUTABLE="$(python3 -c 'import sys; print(sys.executable)')" \
  -Dpybind11_DIR="$(python3 -c 'import pybind11; print(pybind11.get_cmake_dir())')" \
  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
  && cmake --build build --target install --config Release --parallel 4
================================================
FILE: .readthedocs.yml
================================================
# .readthedocs.yml -- Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required: version of the configuration schema
version: 2

# Build image and the version of Python used for the docs build
build:
  os: "ubuntu-22.04"
  tools:
    python: "3.10"

# Install the package from the repository root with its "doc" extra
python:
  install:
    - method: pip
      path: "."
      extra_requirements:
        - doc
================================================
FILE: CHANGES.rst
================================================
=============
Release notes
=============
Version 0.12.5, Jul 2025
------------------------
- FIX: scipy 1.16.0 no longer supports mvn, code now migrated to qmvn.
https://github.com/KaveIO/PhiK/issues/101
https://github.com/KaveIO/PhiK/pull/102
- Drop support for Python 3.8, has reached end of life.
Version 0.12.4, Jan 2024
------------------------
- Add support for Python 3.12.
- ENH: added plotting kwargs to correlation_report function.
https://github.com/KaveIO/PhiK/issues/58
- FIX: fix of bin edge values they are rounded with 1e-14
https://github.com/KaveIO/PhiK/issues/60
- FIX: numpy random multinomial requires integer number of samples (for nixOS)
https://github.com/KaveIO/PhiK/issues/73
- FIX: pandas deprecation warning
https://github.com/KaveIO/PhiK/pull/74
- Drop support for Python 3.7, has reached end of life.
Version 0.12.3, Dec 2022
------------------------
- Add support for Python 3.11
Version 0.12.2, Mar 2022
------------------------
- Fix missing setup.py and pyproject.toml in source distribution
- Support wheels for ARM MacOS (Apple silicon)
Version 0.12.1, Mar 2022
------------------------
- Two fixes to make calculation of global phik robust: global phik capped in range [0, 1],
and check for successful correlation matrix inversion.
- Migration to scikit-build 0.13.1.
- Support wheels for Python 3.10.
Version 0.12.0, July 2021
-------------------------
C++ Extension
~~~~~~~~~~~~~
Phi_K contains an optional C++ extension to compute the significance matrix using the `hypergeometric` method
(also called the `Patefield` method).
Note that the PyPi distributed wheels contain a pre-built extension for Linux, MacOS and Windows.
A manual (pip) setup will attempt to build and install the extension, if it fails it will install without the extension.
If so, using the `hypergeometric` method without the extension will trigger a
NotImplementedError.
Compiler requirements through Pybind11:
- Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer)
- GCC 4.8 or newer
- Microsoft Visual Studio 2015 Update 3 or newer
- Intel classic C++ compiler 18 or newer (ICC 20.2 tested in CI)
- Cygwin/GCC (previously tested on 2.5.1)
- NVCC (CUDA 11.0 tested in CI)
- NVIDIA PGI (20.9 tested in CI)
Other
~~~~~
* You can now manually set the number of parallel jobs in the evaluation of Phi_K or its statistical significance
(when using MC simulations). For example, to use 4 parallel jobs do:
.. code-block:: python
df.phik_matrix(njobs = 4)
df.significance_matrix(njobs = 4)
The default value is -1, in which case all available cores are used. When using ``njobs=1`` no parallel processing
is applied.
* Phi_K can now be calculated with an independent expectation histogram:
.. code-block:: python
from phik.phik import phik_from_hist2d
cols = ["mileage", "car_size"]
interval_cols = ["mileage"]
observed = df1[["feature1", "feature2"]].hist2d()
expected = df2[["feature1", "feature2"]].hist2d()
phik_value = phik_from_hist2d(observed=observed, expected=expected)
The expected histogram is taken to be (relatively) large in number of counts
compared with the observed histogram.
Or one can compare two (pre-binned) datasets against each other directly. Again the expected dataset
is assumed to be relatively large:
.. code-block:: python
from phik.phik import phik_observed_vs_expected_from_rebinned_df
phik_matrix = phik_observed_vs_expected_from_rebinned_df(df1_binned, df2_binned)
* Added links in the readme to the basic and advanced Phi_K tutorials on google colab.
* Migrated the spark example Phi_K notebook from popmon to directly using histogrammar for histogram creation.
Older versions
--------------
* Please see documentation for full details: https://phik.readthedocs.io
================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.17...3.27)

# Scikit-build-core sets these values for you, or you can just hard-code the
# name and version.
project(
  ${SKBUILD_PROJECT_NAME}
  VERSION ${SKBUILD_PROJECT_VERSION}
  DESCRIPTION "C++ bindings for simulation RXC tables"
  LANGUAGES CXX)

# Fail at configure time if the compiler cannot provide C++14, instead of
# silently falling back to an older standard.
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Define CMAKE_INSTALL_xxx: LIBDIR, INCLUDEDIR
include(GNUInstallDirs)

find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
find_package(pybind11 CONFIG REQUIRED)

# Directory holding the C++ sources of the simulation core
set(SUBPATH ${PROJECT_SOURCE_DIR}/phik/simcore)

# ##############################################################################
# build ASA159 library                                                         #
# ##############################################################################

add_library(_asa159 OBJECT ${SUBPATH}/asa159.cpp)
target_include_directories(_asa159 PRIVATE ${SUBPATH})
# The objects are linked into the Python extension module below, so they must
# be position independent. The target property makes CMake emit the right flag
# for every compiler that needs one, where the previous global "-fPIC" was
# only added for GNU.
set_target_properties(_asa159 PROPERTIES POSITION_INDEPENDENT_CODE ON)

# ##############################################################################
# EXECUTABLE                                                                   #
# ##############################################################################

pybind11_add_module(_phik_simulation_core MODULE ${SUBPATH}/bindings.cpp
                    ${SUBPATH}/simulation.hpp $<TARGET_OBJECTS:_asa159>)

# Expose the package version to the C++ sources as VERSION_INFO
target_compile_definitions(_phik_simulation_core
                           PRIVATE VERSION_INFO=${SKBUILD_PROJECT_VERSION})

target_include_directories(
  _phik_simulation_core PUBLIC $<INSTALL_INTERFACE:include>
                               $<BUILD_INTERFACE:${SUBPATH}>)

# Manual builds (.mbuild.sh passes -DPHIK_MBUILD=ON) install into the source
# tree, i.e. into <source dir>/<project name>/lib.
if(PHIK_MBUILD)
  set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}")
endif()

install(TARGETS _phik_simulation_core LIBRARY DESTINATION "${PROJECT_NAME}/lib")

# Quiet a warning, since this project is only valid with SKBUILD
set(ignoreMe "${SKBUILD}")
================================================
FILE: LICENSE
================================================
##############################################################################
#
# Copyright 2016 KPMG Advisory N.V. (unless otherwise stated)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
================================================
FILE: NOTICE
================================================
################################################################################################
#
# NOTICE: pass-through licensing of bundled components
#
# PhiK gathers together a toolkit of pre-existing third-party open-source software components.
# These software components are governed by their own licenses which PhiK does not
# modify or supersede, please consult the originating authors. These components altogether
# have a mixture of the following licenses: Apache 2.0, GPL 2.0, AGPL and LGPL, ZPL, MIT, PSF,
# BSD and some BSD-like simple licenses.
# For scipy and numpy see: http://docs.continuum.io/anaconda/licenses.html .
#
# Although we have examined the licenses to verify acceptance of commercial and non-commercial
# use, please see and consult the original licenses or authors.
#
################################################################################################
================================================
FILE: README.rst
================================================
==========================
Phi_K Correlation Constant
==========================
* Version: 0.12.5. Released: Jul 2025
* Release notes: https://github.com/KaveIO/PhiK/blob/master/CHANGES.rst
* Repository: https://github.com/kaveio/phik
* Documentation: https://phik.readthedocs.io
* Publication: `[official] <https://www.sciencedirect.com/science/article/abs/pii/S0167947320301341>`_ `[arxiv pre-print] <https://arxiv.org/abs/1811.11440>`_
Phi_K is a practical correlation constant that works consistently between categorical, ordinal and interval variables.
It is based on several refinements to Pearson's hypothesis test of independence of two variables. Essentially, the
contingency test statistic of two variables is interpreted as if coming from a rotated bi-variate normal distribution,
where the tilt is interpreted as Phi_K.
The combined features of Phi_K form an advantage over existing coefficients. First, it works consistently between categorical, ordinal and interval variables.
Second, it captures non-linear dependency. Third, it reverts to the Pearson correlation coefficient in case of a bi-variate normal input distribution.
These are useful features when studying the correlation matrix of variables with mixed types.
For details on the methodology behind the calculations, please see our publication. Emphasis is paid to the proper evaluation of statistical significance of correlations and to the interpretation of variable relationships
in a contingency table, in particular in case of low statistics samples.
The presented algorithms are easy to use and available through this public Python library.
Example notebooks
=================
.. list-table::
:widths: 60 40
:header-rows: 1
* - Static link
- Google Colab link
* - `basic tutorial <https://nbviewer.jupyter.org/github/KaveIO/PhiK/blob/master/phik/notebooks/phik_tutorial_basic.ipynb>`_
- `basic on colab <https://colab.research.google.com/github/KaveIO/PhiK/blob/master/phik/notebooks/phik_tutorial_basic.ipynb>`_
* - `advanced tutorial (detailed configuration) <https://nbviewer.jupyter.org/github/KaveIO/PhiK/blob/master/phik/notebooks/phik_tutorial_advanced.ipynb>`_
- `advanced on colab <https://colab.research.google.com/github/KaveIO/PhiK/blob/master/phik/notebooks/phik_tutorial_advanced.ipynb>`_
* - `spark tutorial <https://nbviewer.jupyter.org/github/KaveIO/PhiK/blob/master/phik/notebooks/phik_tutorial_spark.ipynb>`_
- no spark available
Documentation
=============
The entire Phi_K documentation including tutorials can be found at `read-the-docs <https://phik.readthedocs.io>`_.
See the tutorials for detailed examples on how to run the code with pandas. We also have one example on how to
calculate the Phi_K correlation matrix for a spark dataframe.
Check it out
============
The Phi_K library requires Python >= 3.9 and is pip friendly. To get started, simply do:
.. code-block:: bash
$ pip install phik
or check out the code from our GitHub repository:
.. code-block:: bash
$ git clone https://github.com/KaveIO/PhiK.git
$ pip install -e PhiK/
where in this example the code is installed in edit mode (option -e).
You can now use the package in Python with:
.. code-block:: python
import phik
**Congratulations, you are now ready to use the PhiK correlation analyzer library!**
Quick run
=========
As a quick example, you can do:
.. code-block:: python
import pandas as pd
import phik
from phik import resources, report
# open fake car insurance data
df = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') )
df.head()
# Pearson's correlation matrix between numeric variables (pandas functionality)
df.corr()
# get the phi_k correlation matrix between all variables
df.phik_matrix()
# get global correlations based on phi_k correlation matrix
df.global_phik()
# get the significance matrix (expressed as one-sided Z)
# of the hypothesis test of each variable-pair dependency
df.significance_matrix()
# contingency table of two columns
cols = ['mileage','car_size']
df[cols].hist2d()
# normalized residuals of contingency test applied to cols
df[cols].outlier_significance_matrix()
# show the normalized residuals of each variable-pair
df.outlier_significance_matrices()
# generate a phik correlation report and save as test.pdf
report.correlation_report(df, pdf_file_name='test.pdf')
For all available examples, please see the `tutorials <https://phik.readthedocs.io/en/latest/tutorials.html>`_ at read-the-docs.
Contact and support
===================
* Issues and Ideas: https://github.com/kaveio/phik/issues
Please note that support is (only) provided on a best-effort basis.
================================================
FILE: docs/Makefile
================================================
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
# Every target in this file is a command, not a file; declare all of them
# (including applehelp, latexpdfja, texinfo, info, xml and pseudoxml) as phony.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp applehelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes linkcheck doctest coverage xml pseudoxml
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " applehelp to make an Apple Help Book"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
@echo " coverage to run coverage check of the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/DecisionEngine.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/DecisionEngine.qhc"
applehelp:
$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
@echo
@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
@echo "N.B. You won't be able to view it unless you put it in" \
"~/Library/Documentation/Help or install it in your application" \
"bundle."
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/DecisionEngine"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/DecisionEngine"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
# Build the Texinfo sources, then run them through makeinfo. The sub-make is
# invoked via $(MAKE) (as in the latexpdf target) so that command-line flags
# and the jobserver are passed on.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
coverage:
$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
@echo "Testing of coverage in the sources finished, look at the " \
"results in $(BUILDDIR)/coverage/python.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
================================================
FILE: docs/README.rst
================================================
Generating Documentation with Sphinx
====================================
This README is for generating and writing documentation using Sphinx.
On the repository there should already be the auto-generated files
along with the regular documentation.
Installing Sphinx
-----------------
First install Sphinx. Go to http://www.sphinx-doc.org/en/stable/ or run
::
pip install -U Sphinx
pip install -U sphinx-rtd-theme
conda install -c conda-forge nbsphinx
The docs folder has the structure of a Sphinx project.
However, if you want to make a new Sphinx project run:
::
sphinx-quickstart
It quickly generates a conf.py file which contains your configuration
for your sphinx build.
Update the HTML docs
--------------------
Now we want Sphinx to autogenerate from docstrings and other
documentation in the code base. Luckily Sphinx has the apidoc
functionality. This goes through a path, finds all the python files and
depending on your arguments, parses certain parts of the code
(docstring, hidden classes, etc.).
**First make sure your environment is set up properly. Python must be
able to import all modules, otherwise it will not work!**
From the root of the repository:
::
$ source setup.sh
To run the autogeneration of the documentation type in /docs/:
::
./autogenerate.sh
to scan the pyfiles and generate \*.rst files with the documentation.
The script itself contains the usage of apidoc.
Now to make the actual documentation files run:
::
make clean
to clean up the old make of sphinx and run:
::
make html
to make the new html build. It will be stored in (your config can adjust
this, but the default is:) docs/build/html/ The index.html is the
starting page. Open this file to see the result.
Mounting a different repository to vagrant
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When you want to develop code that is not part of the repository that
your vagrant is in, you can mount it separately. This is done by changing
the Vagrantfile, by changing the ``#mount`` line to the path of the repository
that you want to mount:
::
config.vm.synced_folder "<PATH_TO_REPOSITORY>", "<LOCATION_TO_MOUNT>", id: "esrepo"
where the location to mount is e.g. /opt/eskapade.
What is an .rst file?
~~~~~~~~~~~~~~~~~~~~~
R(e)ST is the format that Sphinx uses; it stands for reStructuredText
(http://docutils.sourceforge.net/docs/user/rst/quickref.html). It looks
for other RST files to import, see index.rst to see how the **toctree**
refers to other files.
================================================
FILE: docs/autogenerate.sh
================================================
#!/bin/bash
# Regenerate the Sphinx API documentation (*.rst files) of the phik package.
# Paths are relative: run this script from inside the docs/ directory.

# stop at the first failing command, so that a failed sphinx-apidoc run does
# not go on to move or remove files
set -e

# (re)create required directories
rm -rf autogen
mkdir -p source/_static autogen

# auto-generate code documentation
# (the package lives in <repository root>/phik, i.e. ../phik seen from docs/)
sphinx-apidoc -f -H PhiK -o autogen ../phik
mv autogen/modules.rst autogen/phik_index.rst
mv autogen/* source/

# remove auto-gen directory
rm -rf autogen
================================================
FILE: docs/source/code.rst
================================================
API Documentation
=================
.. toctree::
:maxdepth: 2
phik_index
================================================
FILE: docs/source/conf.py
================================================
# -*- coding: utf-8 -*-
#
# PhiK documentation build configuration file for sphinx.
#
#
import os
#from unittest.mock import MagicMock
import phik
# Classes that use non-python modules are not always available in the
# RTD environment. By mocking them we can still import these classes
# in the code and RTD can subsequently go through the code and get
# the docstrings.
#class Mock(MagicMock):
# @classmethod
# def __getattr__(cls, name):
# return MagicMock()
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.insert(0, os.path.abspath(''))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
# Sphinx extensions enabled for this build: API documentation from docstrings
# (autodoc), math rendering (mathjax) and config-dependent content (ifconfig).
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx.ext.ifconfig',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The encoding of source files.
# source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'Phi_K correlation library'
copyright = '2018, KPMG Advisory N.V.'
author = 'KPMG Advanced Analytics & Big Data team'
# The documented version is read from the importable phik package.
version = phik.__version__

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['*test*', 'phik.tutorials.*']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False

# -- Options for HTML output ----------------------------------------------

# on_rtd is whether we are on readthedocs.org, this line of code grabbed from docs.readthedocs.org
on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
if not on_rtd:
    # Local build: select the Read the Docs theme explicitly.
    import sphinx_rtd_theme
    html_theme = "sphinx_rtd_theme"
    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# otherwise, readthedocs.org uses their theme by default, so no need to specify it

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# If false, no index is generated.
html_use_index = True

# If true, the index is split into individual pages for each letter.
# html_split_index = False

# If true, links to the reST sources are added to the pages.
html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
# html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
html_show_copyright = True

# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
html_search_language = 'en'

# Output file base name for HTML help builder.
htmlhelp_basename = 'PhiKdoc'

# -- Options for LaTeX output ---------------------------------------------

# All LaTeX options are left at their defaults; the commented entries show
# the available keys.
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    # 'preamble': '',
    # Latex figure (float) alignment
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'PhiK.tex', 'PhiK Documentation',
     'KPMG Advanced Analytics & Big Data team', 'manual'),
]

# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'phik', 'PhiK Documentation',
     [author], 1)
]

# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'PhiK', 'PhiK Documentation',
     author, 'PhiK', 'One line description of project.',
     'Miscellaneous'),
]
def skip(app, what, name, obj, skip, options):
    """Handler for the ``autodoc-skip-member`` event that keeps ``__init__``.

    :param app: the Sphinx application (unused)
    :param what: kind of object the member belongs to (unused)
    :param str name: name of the member under consideration
    :param obj: the member itself (unused)
    :param skip: the skip decision autodoc would take by default
    :param options: the autodoc directive options (unused)
    :returns: False for ``__init__`` (never skip it), otherwise the default decision
    """
    # Constructors are always documented; everything else follows autodoc.
    return False if name == "__init__" else skip
def setup(app):
    """Sphinx extension entry point: register the ``autodoc-skip-member`` handler.

    :param app: the Sphinx application object the handler is connected to
    """
    # ``skip`` decides per member whether autodoc leaves it out of the docs.
    app.connect("autodoc-skip-member", skip)
================================================
FILE: docs/source/developing.rst
================================================
===========================
Developing and Contributing
===========================
Working on the package
----------------------
You have some cool feature and/or algorithm you want to add to the package. How do you go about it?
First clone the package.
.. code-block:: bash
git clone https://github.com/KaveIO/PhiK.git
then
.. code-block:: bash
pip install -e PhiK/
this will install ``PhiK`` in editable mode, which will allow you to edit the code and run it as
you would with a normal installation of the ``PhiK`` correlation analyzer package.
To make sure that everything works try executing the tests, e.g.
.. code-block:: bash
cd PhiK/
phik_trial .
or
.. code-block:: bash
cd PhiK/
python -m pytest tests/
That's it.
Contributing
------------
When contributing to this repository, please first discuss the change you wish to make via issue, email, or any
other method with the owners of this repository before making a change. You can find the contact information on the
`index <index.html>`_ page.
Note that, when contributing, all tests should succeed.
Tips and Tricks
---------------
- Enable auto reload in ``jupyter``:
.. code-block:: python
%load_ext autoreload
%autoreload 2
this will reload modules before executing any user code.
================================================
FILE: docs/source/index.rst
================================================
.. PhiK documentation master file, created by
sphinx-quickstart on Thu Jul 7 14:25:54 2016.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
.. include:: ../../README.rst
Contents
========
.. toctree::
:maxdepth: 2
introduction
tutorials
publication
developing
API
---
.. toctree::
:maxdepth: 1
code
Indices and tables
------------------
* :ref:`genindex`
* :ref:`modindex`
================================================
FILE: docs/source/introduction.rst
================================================
======================
Why did we build this?
======================
When exploring a data set, for example to model one variable in terms of the others,
it is useful to summarize the dependencies between the variables, assess their significances, and
visualize the individual variable dependencies.
The ``PhiK`` correlation analyzer library contains several useful functions to help one do so.
* This library implements a novel correlation coefficient, :math:`\phi_{K}`, with properties that - taken together - form
an advantage over existing methods.
The calculation of correlation coefficients between paired data variables is a standard tool of analysis for every data analyst.
Pearson's correlation coefficient is a de facto standard in most fields, but by construction only works for interval variables
(sometimes called continuous variables). Pearson is unsuitable for data sets with mixed variable types,
e.g. where some variables are ordinal or categorical.
While many correlation coefficients exist, each with different features, we have not been able to find a
correlation coefficient with Pearson-like characteristics
and a sound statistical interpretation that works for interval, ordinal and categorical variable types alike.
The correlation coefficient :math:`\phi_{K}` follows a uniform treatment for interval, ordinal and categorical variables,
captures non-linear dependencies, and is similar to Pearson's correlation coefficient in case of a bivariate normal input distribution.
* We found that, by default, popular analysis libraries such as ``R`` and ``scipy`` make incorrect ("asymptotic") assumptions when assessing
the statistical significance of the :math:`\chi^2` contingency test of variable independence. In particular, the actual number of
degrees of freedom and the shape of the test statistic distribution can differ significantly from their theoretical
predictions in case of low to medium statistics data samples. This leads to incorrect p-values for the hypothesis test of variable
independence. A prescription has been implemented to fix these two mistakes.
* Visualizing the dependency between variables can be tricky, especially when dealing with (unordered) categorical variables.
To help interpret any variable relationship found, we provide a method for the detection of
significant excesses or deficits of records with respect to the expected values in a contingency table, so-called outliers,
using a statistically independent evaluation for expected frequency of records, accounting for the uncertainty on the expectation.
We evaluate the significance of each outlier frequency in a table, and normalize and visualize these accordingly.
The resulting plots we find to be very valuable to help interpret variable dependencies,
and work alike for interval, ordinal and categorical variables.
The ``PhiK`` analysis library is particularly useful in modern-day analysis when studying the dependencies between a set of
variables with mixed types, where often some variables are categorical.
The package has been used by us to study surveys, insurance claims, correlograms, etc.
For details on the methodology behind the calculations, please see our publication.
For the available examples on how to use the methods, please see the `tutorials <tutorials.html>`_ section.
================================================
FILE: docs/source/phik.decorators.rst
================================================
phik.decorators package
=======================
Submodules
----------
phik.decorators.pandas module
-----------------------------
.. automodule:: phik.decorators.pandas
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: phik.decorators
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/phik.rst
================================================
phik package
============
Subpackages
-----------
.. toctree::
phik.decorators
Submodules
----------
phik.betainc module
-------------------
.. automodule:: phik.betainc
:members:
:undoc-members:
:show-inheritance:
phik.binning module
-------------------
.. automodule:: phik.binning
:members:
:undoc-members:
:show-inheritance:
phik.bivariate module
---------------------
.. automodule:: phik.bivariate
:members:
:undoc-members:
:show-inheritance:
phik.data\_quality module
-------------------------
.. automodule:: phik.data_quality
:members:
:undoc-members:
:show-inheritance:
phik.definitions module
-----------------------
.. automodule:: phik.definitions
:members:
:undoc-members:
:show-inheritance:
phik.entry\_points module
-------------------------
.. automodule:: phik.entry_points
:members:
:undoc-members:
:show-inheritance:
phik.outliers module
--------------------
.. automodule:: phik.outliers
:members:
:undoc-members:
:show-inheritance:
phik.phik module
----------------
.. automodule:: phik.phik
:members:
:undoc-members:
:show-inheritance:
phik.report module
------------------
.. automodule:: phik.report
:members:
:undoc-members:
:show-inheritance:
phik.resources module
---------------------
.. automodule:: phik.resources
:members:
:undoc-members:
:show-inheritance:
phik.significance module
------------------------
.. automodule:: phik.significance
:members:
:undoc-members:
:show-inheritance:
phik.simulation module
----------------------
.. automodule:: phik.simulation
:members:
:undoc-members:
:show-inheritance:
phik.statistics module
----------------------
.. automodule:: phik.statistics
:members:
:undoc-members:
:show-inheritance:
phik.version module
-------------------
.. automodule:: phik.version
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: phik
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/phik_index.rst
================================================
PhiK
====
.. toctree::
:maxdepth: 4
phik
================================================
FILE: docs/source/publication.rst
================================================
===================
Publication & Talks
===================
Publication
-----------
* peer-reviewed: https://www.sciencedirect.com/science/article/abs/pii/S0167947320301341
* arXiv pre-print: https://arxiv.org/abs/1811.11440
Talks
-----
* Coming soon.
Cite as
-------
Baak, M., Koopman, R., Snoek, H., & Klous, S. (2020). A new correlation coefficient between categorical, ordinal and interval variables with Pearson characteristics. *Computational Statistics & Data Analysis*, 152, 107043.
.. code-block:: latex
@article{phik2020,
title={A new correlation coefficient between categorical, ordinal and interval variables with Pearson characteristics},
author={Baak, M and Koopman, R and Snoek, H and Klous, S},
journal={Computational Statistics \& Data Analysis},
volume={152},
pages={107043},
year={2020},
publisher={Elsevier}
}
References
----------
* Web page: https://phik.readthedocs.io
* Repository: https://github.com/kaveio/phik
* Issues & Ideas: https://github.com/kaveio/phik/issues
* Contact us at: kave [at] kpmg [dot] com
================================================
FILE: docs/source/tutorials.rst
================================================
=========
Tutorials
=========
This section contains materials on how to use the Phi_K correlation analysis code.
There are additional side notes on how certain aspects work and where to find parts of the code.
For more in depth explanations on the functionality of the code-base, try the `API docs <phik_index.html>`_.
The tutorials are available in the ``phik/notebooks`` directory. We have:
* A basic tutorial: this covers the basics of calculating Phi_K, the statistical significance, and interpreting the correlation.
* An advanced tutorial: this shows how to use the advanced features of the ``PhiK`` library.
* A spark tutorial: this shows how to calculate the Phi_K correlation matrix for a spark dataframe.
You can open these notebooks directly:
* Run them interactively at `MyBinder <https://mybinder.org/v2/gh/KaveIO/PhiK/master?filepath=phik%2Fnotebooks>`_.
* View them statically: `basic tutorial <http://nbviewer.ipython.org/urls/raw.github.com/kaveio/phik/master/phik/notebooks/phik_tutorial_basic.ipynb>`_ and the `advanced tutorial <http://nbviewer.ipython.org/urls/raw.github.com/kaveio/phik/master/phik/notebooks/phik_tutorial_advanced.ipynb>`_ and the `spark tutorial <http://nbviewer.ipython.org/urls/raw.github.com/kaveio/phik/master/phik/notebooks/phik_tutorial_spark.ipynb>`_.
================================================
FILE: example.py
================================================
"""Minimal tour of the PhiK functionality on the bundled fake insurance data."""
import pandas as pd
# NOTE(review): the DataFrame methods used below (phik_matrix, global_phik, ...)
# are presumably attached to pandas as a side effect of importing phik -- confirm.
import phik
from phik import resources, report

# open fake car insurance data
df = pd.read_csv(resources.fixture('fake_insurance_data.csv.gz'))
df.head()

# Pearson's correlation matrix between numeric variables (pandas functionality).
# numeric_only=True is needed because pandas >= 2.0 no longer silently drops
# non-numeric columns and raises on them instead.
df.corr(numeric_only=True)

# get the phi_k correlation matrix between all variables
df.phik_matrix()

# get global correlations based on phi_k correlation matrix
df.global_phik()

# get the significance matrix (expressed as one-sided Z)
# of the hypothesis test of each variable-pair dependency
df.significance_matrix()

# contingency table of two columns
cols = ['mileage', 'car_size']
df[cols].hist2d()

# normalized residuals of contingency test applied to cols
df[cols].outlier_significance_matrix()

# show the normalized residuals of each variable-pair
df.outlier_significance_matrices()

# generate a phik correlation report and save as test.pdf
report.correlation_report(df, pdf_file_name='test.pdf')
================================================
FILE: phik/__init__.py
================================================
# flake8: noqa
import importlib.metadata
from phik import decorators
from phik.outliers import (
outlier_significance_from_array,
outlier_significance_matrices,
outlier_significance_matrix,
)
from phik.phik import global_phik_array, phik_from_array, phik_matrix
from phik.significance import significance_from_array, significance_matrix
# The version is read from the metadata of the installed "phik" distribution.
__version__ = importlib.metadata.version("phik")

# Public API of the package: the decorators sub-package plus the array- and
# DataFrame-level entry points imported above.
__all__ = [
    "decorators",
    "phik_from_array",
    "significance_from_array",
    "outlier_significance_from_array",
    "phik_matrix",
    "global_phik_array",
    "significance_matrix",
    "outlier_significance_matrices",
    "outlier_significance_matrix",
]
================================================
FILE: phik/betainc.py
================================================
"""Project: PhiK - correlation analyzer library
Created: 2018/09/05
Description:
Implementation of incomplete beta function
Authors:
KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands
Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""
import numpy as np
from scipy.special import gammaln
from typing import Tuple
def contfractbeta(
    a: float, b: float, x: float, ITMAX: int = 5000, EPS: float = 1.0e-7
) -> float:
    """Evaluate the continued fraction used by the incomplete Beta function.

    Code translated from: Numerical Recipes in C.

    Example kindly taken from blog:
    https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/

    :param float a: a
    :param float b: b
    :param float x: x
    :param int ITMAX: iteration limit (ITMAX + 1 iterations are attempted), default is 5000.
    :param float EPS: relative convergence tolerance, default is 1e-7.
    :returns: value of the continued fraction
    :rtype: float
    :raises ValueError: if the fraction has not converged within the iteration limit
    """
    sum_ab = a + b
    a_plus_one = a + 1.0
    a_minus_one = a - 1.0

    # Numerators/denominators of the two most recent convergents.
    num_prev = 1.0
    den_prev = 1.0
    num_curr = 1.0
    den_curr = 1.0 - sum_ab * x / a_plus_one

    for em in map(float, range(1, ITMAX + 2)):
        two_em = em + em

        # First half of the recurrence step.
        coef = em * (b - em) * x / ((a_minus_one + two_em) * (a + two_em))
        num_mid = num_curr + coef * num_prev
        den_mid = den_curr + coef * den_prev

        # Second half of the recurrence step.
        coef = -(a + em) * (sum_ab + em) * x / ((a_plus_one + two_em) * (a + two_em))
        num_next = num_mid + coef * num_curr
        den_next = den_mid + coef * den_curr

        # Renormalise by the newest denominator so the current one is 1.
        last_value = num_curr
        num_prev = num_mid / den_next
        den_prev = den_mid / den_next
        num_curr = num_next / den_next
        den_curr = 1.0

        if abs(num_curr - last_value) < EPS * abs(num_curr):
            return num_curr

    raise ValueError(
        "a={0:f} or b={1:f} too large, or ITMAX={2:d} too small to compute incomplete beta function.".format(
            a, b, ITMAX
        )
    )
def incompbeta(a: float, b: float, x: float) -> float:
    """Evaluate the incomplete beta function.

    Code translated from: Numerical Recipes in C.

    Here a, b > 0 and 0 <= x <= 1.
    The continued fraction is evaluated with contfractbeta(a, b, x) using its
    default iteration limit and precision.

    Example kindly taken from blog:
    https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/

    :param float a: a
    :param float b: b
    :param float x: x
    :returns: incomplete beta function
    :rtype: float
    """
    # the boundaries are known exactly
    if x == 0:
        return 0
    if x == 1:
        return 1

    log_prefactor = (
        gammaln(a + b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1 - x)
    )
    prefactor = np.exp(log_prefactor)

    # below the switch point expand in x directly; above it expand the
    # mirrored problem (a <-> b, x -> 1 - x) and take the complement
    if x < (a + 1) / (a + b + 2):
        return prefactor * contfractbeta(a, b, x) / a
    return 1 - prefactor * contfractbeta(b, a, 1 - x) / b
def log_incompbeta(a: float, b: float, x: float) -> Tuple[float, float]:
    """Evaluation of logarithm of incomplete beta function

    Logarithm of incomplete beta function is implemented to ensure sufficient precision
    for values very close to zero and one.

    Code translated from: Numerical Recipes in C.

    Here a, b > 0 and 0 <= x <= 1.
    The continued fraction is evaluated with contfractbeta(a, b, x) using its
    default iteration limit and precision.

    Example kindly taken from blog:
    https://malishoaib.wordpress.com/2014/04/15/the-beautiful-beta-functions-in-raw-python/

    :param float a: a
    :param float b: b
    :param float x: x
    :returns: tuple of log(incb) and log(1-incb)
    :rtype: tuple
    """
    # special cases
    if x == 0:
        return -np.inf, 0
    elif x == 1:
        return 0, -np.inf

    # default
    lbeta = gammaln(a + b) - gammaln(a) - gammaln(b) + a * np.log(x) + b * np.log(1 - x)

    if x < (a + 1) / (a + b + 2):
        # evaluate the (expensive) continued fraction only once
        cf = contfractbeta(a, b, x)
        p = np.exp(lbeta) * cf / a
        logp = lbeta + np.log(cf) - np.log(a)
        # log1p keeps precision when p is tiny, where log(1 - p) would round to 0
        logq = np.log1p(-p)
    else:
        cf = contfractbeta(b, a, 1 - x)
        q = np.exp(lbeta) * (cf / b)
        # log(p) = log(1 - q); log1p keeps precision when q is tiny
        logp = np.log1p(-q)
        logq = lbeta + np.log(cf) - np.log(b)
    return logp, logq
================================================
FILE: phik/binning.py
================================================
"""Project: PhiK - correlation analyzer library
Created: 2018/09/06
Description:
A set of rebinning functions, to help rebin two lists into a 2d histogram.
Authors:
KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands
Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""
import sys
from typing import List, Optional, Tuple, Union
import numpy as np
import pandas as pd
from phik import definitions as defs
from phik.data_quality import dq_check_nunique_values
from phik.utils import array_like_to_dataframe, guess_interval_cols
def bin_edges(
    arr: Union[np.ndarray, list, pd.Series], nbins: int, quantile: bool = False
) -> np.ndarray:
    """
    Create uniform or quantile bin-edges for the input array.

    NaN values in the input are ignored. The lowest edge is moved slightly
    below the minimum so that the minimum value itself falls inside the
    first bin rather than in the underflow.

    :param arr: array like object (numpy array, list or pandas Series) with numeric input data
    :param int nbins: the number of bins
    :param bool quantile: uniform bins (False) or bins based on quantiles (True)
    :returns: array with nbins + 1 bin edges
    """
    # Convert up front so that plain lists can be masked as well.
    values = np.asarray(arr, dtype=float)
    values = values[~np.isnan(values)]

    if quantile:
        quantiles = np.linspace(0, 1, nbins + 1)
        xbins = np.quantile(values, quantiles)
        xbins[0] -= max(1e-14 * abs(xbins[0]), sys.float_info.min)
    else:
        min_value = np.min(values)
        constant = max(1e-14 * abs(min_value), sys.float_info.min)
        xbins = np.linspace(min_value - constant, np.max(values), nbins + 1)
    return xbins
def bin_array(
    arr: Union[np.ndarray, list], bin_edges: Union[np.ndarray, list]
) -> Tuple[np.ndarray, list]:
    """
    Replace every value by the index of the bin it falls in.

    Values outside the given edges are marked with the underflow/overflow
    markers from the definitions module; NaN values stay NaN.

    :param arr: array like object with input data
    :param bin_edges: list with bin edges.
    :returns: tuple of the indexed data and the list of (low, high) edge pairs of the bins
        that contain at least one value
    """
    n_edges = len(bin_edges)

    # object dtype: the result holds integers, NaN and the UF/OF markers
    indexed = np.searchsorted(bin_edges, arr).astype(object)

    # record the edges of every regular bin that is actually populated
    occupied = pd.Series(indexed).value_counts().index
    labels = [
        (bin_edges[k - 1], bin_edges[k]) for k in range(1, n_edges) if k in occupied
    ]

    # searchsorted places NaN in the overflow bin; put the NaN values back
    indexed[np.flatnonzero(np.isnan(arr))] = np.nan
    # index 0 means "below the first edge"
    indexed[np.flatnonzero(indexed == 0)] = defs.UF
    # index len(bin_edges) means "above the last edge"
    indexed[np.flatnonzero(indexed == n_edges)] = defs.OF
    return indexed, labels
def bin_data(
    data: pd.DataFrame,
    cols: Union[list, np.ndarray, tuple] = (),
    bins: Union[int, list, np.ndarray, dict] = 10,
    quantile: bool = False,
    retbins: bool = False,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]:
    """
    Index the input DataFrame given the bin_edges for the columns specified in cols.

    The input DataFrame is not modified; a copy is binned and returned.

    :param DataFrame data: input data
    :param list cols: list of columns with numeric data which needs to be indexed
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
        E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param bool retbins: if True, also return a dictionary with the bin labels per binned column
    :returns: rebinned DataFrame, or tuple of rebinned DataFrame and bin-label dictionary when retbins is True
    :rtype: pandas.DataFrame
    :raises ValueError: if a column is missing from the bins dictionary, or if the bin
        specification (for a column) is neither a number nor a list/array of edges
    """

    def _is_number(value) -> bool:
        # accepts python as well as numpy integer/floating scalars
        return np.issubdtype(type(value), np.integer) or np.issubdtype(
            type(value), np.floating
        )

    per_column = isinstance(bins, dict)
    if per_column:
        for col in cols:
            if col not in bins:
                raise ValueError(
                    "column {0} is not included in bins dictionary.".format(col)
                )

    # MB 20210307: check for numeric bins turned off here, also done in dq_check_nunique_values().

    binned_data = data.copy()
    bins_dict = {}

    for col in cols:
        spec = bins[col] if per_column else bins
        if _is_number(spec):
            xbins = bin_edges(data[col].astype(float), int(spec), quantile=quantile)
        elif isinstance(spec, (list, np.ndarray)):
            xbins = spec
        else:
            # Fail loudly instead of silently re-using the edges of a
            # previously processed column (or passing None on).
            raise ValueError(
                "Unexpected type for bins. The found type was '%s'" % str(type(spec))
            )

        binned_data[col], bin_labels = bin_array(data[col].astype(float).values, xbins)
        if retbins:
            bins_dict[col] = bin_labels

    if retbins:
        return binned_data, bins_dict
    return binned_data
def auto_bin_data(
    df: pd.DataFrame,
    interval_cols: Optional[list] = None,
    bins: Union[int, list, np.ndarray, dict] = 10,
    quantile: bool = False,
    dropna: bool = True,
    verbose: bool = True,
) -> Tuple[pd.DataFrame, dict]:
    """
    Index the input DataFrame with automatic bin_edges and interval columns.

    :param pd.DataFrame df: input data
    :param list interval_cols: column names of columns with interval variables.
        When None, the interval columns are guessed from the data.
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column
        the bins are specified. (default=10)\
        E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param quantile: when bins is an integer, uniform bins (False) or bins based on quantiles (True)
    :param bool dropna: remove NaN values with True
    :param bool verbose: if False, do not print all interval columns that are guessed
    :return: tuple of the binned DataFrame and the dictionary with bin labels per binned column
    """
    # guess interval columns
    if interval_cols is None:
        interval_cols = guess_interval_cols(df, verbose)

    # clean the data
    df_clean, interval_cols_clean = dq_check_nunique_values(
        df, interval_cols, dropna=dropna
    )

    # perform rebinning
    data_binned, binning_dict = bin_data(
        df_clean, cols=interval_cols_clean, bins=bins, quantile=quantile, retbins=True
    )
    return data_binned, binning_dict
def create_correlation_overview_table(
    vals: List[Tuple[str, str, float]]
) -> pd.DataFrame:
    """
    Turn a list of per-pair values into a symmetric overview table.

    :param list vals: list holding tuples of data for each variable pair formatted as ('var1', 'var2', value)
    :returns: symmetric table with phik/significances of all variable pairs
    :rtype: pandas.DataFrame
    """
    # every pair is entered in both orientations so the pivot is symmetric
    records = [
        record
        for first, second, value in vals
        for record in ([first, second, value], [second, first, value])
    ]
    long_format = pd.DataFrame(records, columns=["var1", "var2", "vals"])
    table = long_format.pivot_table(index="var1", columns="var2", values="vals")

    # drop the axis names introduced by the pivot
    table.columns.name = None
    table.index.name = None
    return table
def hist2d_from_rebinned_df(
    data_binned: pd.DataFrame,
    dropna: bool = True,
    drop_underflow: bool = True,
    drop_overflow: bool = True,
) -> pd.DataFrame:
    """
    Give binned 2d DataFrame of two columns of rebinned input DataFrame

    The input DataFrame is left untouched: replacements are applied to new
    objects instead of in place.

    :param data_binned: rebinned input data. DataFrame must contain exactly two columns
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\
        a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\
        a numeric variable)
    :returns: histogram DataFrame
    """
    c0, c1 = data_binned.columns

    if not dropna:
        # keep NaN records by giving them an explicit label
        data_binned = data_binned.fillna(defs.NaN)
    if drop_underflow:
        # records set to NaN are ignored by the groupby below
        data_binned = data_binned.replace(defs.UF, np.nan)
    if drop_overflow:
        data_binned = data_binned.replace(defs.OF, np.nan)

    # create a contingency table
    df_datahist = (
        data_binned.groupby([c0, c1])[c0].count().to_frame().unstack().fillna(0)
    )
    df_datahist.columns = df_datahist.columns.droplevel()
    return df_datahist
def hist2d(
    df: pd.DataFrame,
    interval_cols: Optional[Union[list, np.ndarray]] = None,
    bins: Union[int, float, list, np.ndarray, dict] = 10,
    quantile: bool = False,
    dropna: bool = True,
    drop_underflow: bool = True,
    drop_overflow: bool = True,
    retbins: bool = False,
    verbose: bool = True,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]:
    """
    Build the 2d histogram (contingency table) of a two-column DataFrame

    :param df: input data. DataFrame must contain exactly two columns
    :param interval_cols: columns with interval variables which need to be binned.
        When None, the interval columns are guessed from the data.
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
        E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True)
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\
        a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\
        a numeric variable)
    :param bool retbins: if True, also return the dictionary with bin labels per binned column
    :param bool verbose: if False, do not print all interval columns that are guessed
    :returns: histogram DataFrame, or tuple of histogram DataFrame and bin-label dictionary when retbins is True
    :raises ValueError: if the DataFrame does not have exactly two columns
    """
    n_columns = len(df.columns)
    if n_columns != 2:
        raise ValueError("DataFrame should contain only two columns")

    columns_to_bin = (
        guess_interval_cols(df, verbose) if interval_cols is None else interval_cols
    )

    # step 1: index the interval columns
    binned, bin_labels = bin_data(
        df, columns_to_bin, retbins=True, bins=bins, quantile=quantile
    )

    # step 2: count the records per pair of bins
    histogram = hist2d_from_rebinned_df(
        binned,
        dropna=dropna,
        drop_underflow=drop_underflow,
        drop_overflow=drop_overflow,
    )

    return (histogram, bin_labels) if retbins else histogram
def hist2d_from_array(
    x: Union[pd.Series, list, np.ndarray], y: Union[pd.Series, list, np.ndarray], **kwargs
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, dict]]:
    """
    Give binned 2d DataFrame of two input arrays

    The two arrays are combined into a two-column DataFrame and all keyword
    arguments are forwarded to hist2d().

    :param x: input data. First array-like.
    :param y: input data. Second array-like.
    :param interval_cols: columns with interval variables which need to be binned
    :param bins: number of bins, or a list of bin edges (same for all columns), or a dictionary where per column the bins are specified. (default=10)\
        E.g.: bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}
    :param bool quantile: when the number of bins is specified, use uniform binning (False) or quantile binning (True)
    :param bool dropna: remove NaN values with True
    :param bool drop_underflow: do not take into account records in underflow bin when True (relevant when binning\
        a numeric variable)
    :param bool drop_overflow: do not take into account records in overflow bin when True (relevant when binning\
        a numeric variable)
    :returns: histogram DataFrame
    """
    df = array_like_to_dataframe(x, y)
    return hist2d(df, **kwargs)
================================================
FILE: phik/bivariate.py
================================================
"""Project: PhiK - correlation analyzer library
Created: 2019/11/23
Description:
Convert Pearson correlation value into a chi2 value of a contingency test
matrix of a bivariate gaussian, and vice-versa.
Calculation uses scipy's mvn library.
Authors:
KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands
Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""
import warnings
import numpy as np
import scipy
from scipy import optimize
# Select the multivariate-normal integrator based on the installed scipy:
# from scipy 1.16 on the private quasi-Monte-Carlo routines are used,
# older versions use the legacy mvn module.
# Only the (major, minor) components are parsed: later components may carry
# non-numeric suffixes (e.g. "0rc1", "dev0+git...") that int() cannot handle.
_scipy_version = tuple(int(v) for v in scipy.__version__.split('.')[:2])
# Tuple comparison is also correct across major versions (e.g. 2.0 >= 1.16).
USE_QMVN = _scipy_version >= (1, 16)

if USE_QMVN:
    from scipy.stats._qmvnt import _qauto, _qmvn
else:
    from scipy.stats._mvn import mvnun
def _mvn_un(rho: float, lower: tuple, upper: tuple,
            rng: np.random.Generator = np.random.default_rng(42)) -> float:
    """Integrate a standard bivariate normal with correlation rho over a rectangle

    The integration itself is delegated to _calc_mvnun.

    :param float rho: tilt parameter
    :param tuple lower: tuple of lower corner of integral area
    :param tuple upper: tuple of upper corner of integral area
    :param np.random.Generator rng: random generator handed to the integrator,
        default is default_rng(42). Note that the default generator is created once,
        at function definition time, and is shared between calls.
    :returns float: integral value
    """
    zero_mean = np.array([0.0, 0.0])
    # unit variances, correlation rho on the off-diagonal
    covariance = np.array([[1.0, rho], [rho, 1.0]])
    return _calc_mvnun(lower=lower, upper=upper, mu=zero_mean, S=covariance, rng=rng)
def _calc_mvnun(lower, upper, mu, S, rng = np.random.default_rng(42)):
    """Integrate a multivariate normal density over a rectangular region.

    Depending on USE_QMVN the integral is evaluated with scipy's
    quasi-Monte-Carlo routines or with the legacy mvnun function.

    :param lower: lower corner of the integration region
    :param upper: upper corner of the integration region
    :param mu: mean of the normal distribution
    :param S: covariance matrix of the normal distribution
    :param np.random.Generator rng: random generator used by the quasi-Monte-Carlo
        branch (not used by the legacy branch). The default generator is created
        once, at function definition time, and is shared between calls.
    :returns: integral value
    """
    if USE_QMVN:
        # _qauto(_qmvn, ...) takes no mean argument, so integrate the centred
        # distribution over the region shifted by mu. This keeps both branches
        # consistent; for mu == 0 the bounds are unchanged.
        mean = np.asarray(mu, dtype=float)
        shifted_lower = np.asarray(lower, dtype=float) - mean
        shifted_upper = np.asarray(upper, dtype=float) - mean
        res = _qauto(_qmvn, S, shifted_lower, shifted_upper, rng)[0]
    else:
        res = mvnun(lower, upper, mu, S)[0]
    return res
def _mvn_array(rho: float, sx: np.ndarray, sy: np.ndarray) -> list:
    """Array of integrals over bivariate normal gauss with correlation

    Each integral is evaluated with ``_calc_mvnun``. Because that call is
    expensive, only about half of the cells are integrated; the second half of
    the returned array is a copy of the first half (without the central cell
    when both axes have an odd number of bins).

    NOTE(review): this shortcut presumably assumes bin edges that are symmetric
    around zero; the returned values are also not laid out in grid order.

    :param float rho: tilt parameter
    :param np.ndarray sx: bin edges array of x-axis
    :param np.ndarray sy: bin edges array of y-axis
    :returns list: integral values (in practice a one-dimensional np.ndarray)
    """
    n_bins_x = len(sx) - 1
    n_bins_y = len(sy) - 1
    half_x, x_remainder = divmod(n_bins_x, 2)
    half_y, y_remainder = divmod(n_bins_y, 2)

    def corners(i, j):
        # (lower corner, upper corner) of cell (i, j)
        return ([sx[i], sy[j]], [sx[i + 1], sy[j + 1]])

    # first half of the x bins, over the full range of y bins
    cells = []
    for i in range(half_x):
        for j in range(n_bins_y):
            cells.append(corners(i, j))

    has_center = False
    if x_remainder == 1:
        # odd number of x bins: add the first half of the middle row
        for j in range(half_y):
            cells.append(corners(half_x, j))
        if y_remainder == 1:
            # odd number of y bins as well: add the central cell, only once
            cells.append(corners(half_x, half_y))
            has_center = True

    # mean and covariance
    mean = np.array([0.0, 0.0])
    covariance = np.array([[1.0, rho], [rho, 1.0]])

    computed = np.array([_calc_mvnun(lo, up, mean, covariance) for lo, up in cells])
    # second half repeats the computed values, leaving out the central cell
    repeated = computed[:-1] if has_center else computed
    return np.concatenate([computed, repeated])
def bivariate_normal_theory(
    rho: float,
    nx: int = -1,
    ny: int = -1,
    n: int = 1,
    sx: np.ndarray = None,
    sy: np.ndarray = None,
) -> np.ndarray:
    """Return binned pdf of bivariate normal distribution.

    This function returns a "perfect" binned bivariate normal distribution.
    Each cell holds the integral of the density over that cell, multiplied by n.

    :param float rho: tilt parameter
    :param int nx: number of uniform bins on x-axis. alternative to sx.
    :param int ny: number of uniform bins on y-axis. alternative to sy.
    :param int n: number of entries. default is one.
    :param np.ndarray sx: bin edges array of x-axis. default is None, in which case
        nx uniform bins on [-5, 5] are used.
    :param np.ndarray sy: bin edges array of y-axis. default is None, in which case
        ny uniform bins on [-5, 5] are used.
    :return: np.ndarray of binned bivariate normal pdf, with shape (number of y bins, number of x bins)
    :raises ValueError: if n is smaller than one
    """
    if n < 1:
        raise ValueError("Number of entries needs to be one or greater.")

    if sx is None:
        sx = np.linspace(-5, 5, nx + 1)
    if sy is None:
        sy = np.linspace(-5, 5, ny + 1)

    # Size the output from the edges that are actually used, so that passing
    # sx / sy without nx / ny works instead of failing on the -1 defaults.
    n_bins_x = len(sx) - 1
    n_bins_y = len(sy) - 1

    bvn = np.zeros((n_bins_y, n_bins_x))
    for i in range(n_bins_x):
        for j in range(n_bins_y):
            lower = (sx[i], sy[j])
            upper = (sx[i + 1], sy[j + 1])
            bvn[j, i] = _mvn_un(rho, lower, upper)
    bvn *= n

    # patch for entry levels that are below machine precision
    # (simulation does not work otherwise)
    # np.float was removed from numpy; the builtin float is the same type.
    eps = np.finfo(float).eps
    bvn[bvn < eps] = eps
    return bvn
def chi2_from_phik(
    rho: float,
    n: int,
    subtract_from_chi2: float = 0,
    corr0: list = None,
    scale: float = None,
    sx: np.ndarray = None,
    sy: np.ndarray = None,
    pedestal: float = 0,
    nx: int = -1,
    ny: int = -1,
) -> float:
    """Calculate chi2-value of bivariate gauss having correlation value rho

    Calculate no-noise chi2 value of bivar gauss with correlation rho,
    with respect to bivariate gauss without any correlation.

    :param float rho: tilt parameter
    :param int n: number of records
    :param float subtract_from_chi2: value subtracted from chi2 calculation. default is 0.
    :param list corr0: mvn_array result for rho=0. Default is None, in which case it is calculated.
    :param float scale: scale is multiplied with the chi2 if set. If None, it is
        chosen such that rho=1 gives the maximum possible chi2 value.
    :param np.ndarray sx: bin edges array of x-axis. default is None.
    :param np.ndarray sy: bin edges array of y-axis. default is None.
    :param float pedestal: pedestal is added to the chi2 if set.
    :param int nx: number of uniform bins on x-axis. alternative to sx. When not
        set (smaller than one), the number of bins implied by sx is used for the scale.
    :param int ny: number of uniform bins on y-axis. alternative to sy. When not
        set (smaller than one), the number of bins implied by sy is used for the scale.
    :returns float: chi2 value
    """
    if sx is None:
        sx = np.linspace(-5, 5, nx + 1)
    if sy is None:
        sy = np.linspace(-5, 5, ny + 1)

    if corr0 is None:
        corr0 = _mvn_array(0, sx, sy)

    def chi2_wrt_uncorrelated(corr):
        # n * sum((c - c0)^2 / c0) over all cells, skipping cells where c0 == 0
        delta_corr2 = (corr - corr0) ** 2
        # protect against division by zero
        ratio = np.divide(
            delta_corr2, corr0, out=np.zeros_like(delta_corr2), where=corr0 != 0
        )
        return n * np.sum(ratio)

    if scale is None:
        # scale ensures that for rho=1, chi2 is the maximum possible value
        chi2_one = chi2_wrt_uncorrelated(_mvn_array(1, sx, sy))
        # Fall back on the number of bins implied by the edges when nx / ny were
        # left at their defaults; otherwise chi2_max would come out negative.
        n_bins_x = nx if nx >= 1 else len(sx) - 1
        n_bins_y = ny if ny >= 1 else len(sy) - 1
        chi2_max = n * min(n_bins_x - 1, n_bins_y - 1)
        scale = (chi2_max - pedestal) / chi2_one

    chi2_rho = chi2_wrt_uncorrelated(_mvn_array(rho, sx, sy))

    chi2 = pedestal + chi2_rho * scale
    return chi2 - subtract_from_chi2
def phik_from_chi2(
    chi2: float,
    n: int,
    nx: int,
    ny: int,
    sx: np.ndarray = None,
    sy: np.ndarray = None,
    pedestal: float = 0,
) -> float:
    """
    Correlation coefficient of bivariate gaussian derived from chi2-value

    The chi2-value is converted into the correlation coefficient rho of a
    bivariate gauss, assuming the given binning and number of records. The
    value of rho is found by solving chi2_from_phik(rho) = chi2 numerically.

    Correlation coefficient value is between 0 and 1.

    Bivariate gaussian's range is set to [-5,5] by construction.

    :param float chi2: input chi2 value
    :param int n: number of records
    :param int nx: number of uniform bins on x-axis. alternative to sx.
    :param int ny: number of uniform bins on y-axis. alternative to sy.
    :param np.ndarray sx: bin edges array of x-axis. default is None.
    :param np.ndarray sy: bin edges array of y-axis. default is None.
    :param float pedestal: pedestal is added to the chi2 if set.
    :returns float: correlation coefficient
    :raises ValueError: if pedestal is negative, or if bin edges are given while
        the matching number of bins is one or smaller
    """
    if pedestal < 0:
        raise ValueError("noise pedestal should be greater than zero.")

    if sx is not None:
        if nx <= 1:
            raise ValueError("number of bins along x-axis is unknown")
    else:
        sx = np.linspace(-5, 5, nx + 1)
    if sy is not None:
        if ny <= 1:
            raise ValueError("number of bins along y-axis is unknown")
    else:
        sy = np.linspace(-5, 5, ny + 1)

    uncorrelated = _mvn_array(0, sx, sy)
    # the fully correlated case is used to fix the scale: for rho=1, chi2 is
    # the maximum possible value
    fully_correlated = _mvn_array(1, sx, sy)

    n_cells = len(uncorrelated)
    if n_cells > 10000 and 0 in uncorrelated:
        warnings.warn(
            "Many cells: {0:d}. Are interval variables set correctly?".format(n_cells)
        )

    squared_shift = (fully_correlated - uncorrelated) ** 2
    # cells with zero expectation contribute zero instead of dividing by zero
    per_cell = np.divide(
        squared_shift,
        uncorrelated,
        out=np.zeros_like(squared_shift),
        where=uncorrelated != 0,
    )
    chi2_one = n * np.sum(per_cell)
    chi2_max = n * min(nx - 1, ny - 1)
    scale = (chi2_max - pedestal) / chi2_one

    # snap values that exceed the maximum only by rounding noise onto the maximum
    if chi2 > chi2_max and np.isclose(chi2, chi2_max, atol=1e-14):
        chi2 = chi2_max

    # only solve for rho if chi2 exceeds noise pedestal
    if chi2 <= pedestal:
        return 0.0
    if chi2 >= chi2_max:
        return 1.0

    solver_args = (n, chi2, uncorrelated, scale, sx, sy, pedestal)
    return optimize.brentq(chi2_from_phik, 0, 1, args=solver_args, xtol=1e-5)
================================================
FILE: phik/data_quality.py
================================================
"""Project: PhiK - correlation analyzer library
Created: 2018/12/28
Description:
A set of functions to check for data quality issues in input data.
Authors:
KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands
Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""
import warnings
import copy
from typing import Tuple
import pandas as pd
import numpy as np
def dq_check_nunique_values(
    df: pd.DataFrame, interval_cols: list, dropna: bool = True
) -> Tuple[pd.DataFrame, list]:
    """
    Basic data quality checks per column in a DataFrame.

    The following checks are done:

    1. For all non-interval variables, if the number of unique values per variable is larger than 1000 a warning is
       issued. When the number of unique values is large, the variable is likely to be an interval variable.
       Calculation of phik will be slow(ish) for pairs of variables where one (or two) have many different values
       (i.e. many bins).

    2. For all interval variables, the number of unique values must be at least two. If the number of unique values is
       zero (i.e. all NaN) the column is removed. If the number of unique values is one, it is not possible to
       automatically create a binning for this variable (as min and max are the same). The variable is therefore
       dropped, irrespective of whether dropna is True or False.

    3. For all non-interval variables, the number of unique values must be at least either
       a) 1 if dropna=False (NaN is now also considered a valid category), or
       b) 2 if dropna=True

    The function returns a copy of the DataFrame where all columns with invalid data are removed. Also the list of
    interval_cols is updated and returned. The input DataFrame is not modified.

    :param pd.DataFrame df: input data
    :param list interval_cols: column names of columns with interval variables.
    :param bool dropna: remove NaN values when True
    :returns: cleaned data, updated list of interval columns
    """
    # check for existing columns
    interval_cols = [col for col in interval_cols if col in df.columns]
    non_interval_cols = sorted(list(set(df.columns) - set(interval_cols)))

    # number of unique values per column (NaN is not counted), computed only once
    n_unique = {col: df[col].nunique() for col in df.columns}

    # Column names are formatted with "!s" rather than ":s": the "s" format code
    # raises a ValueError for names that are not strings (e.g. integer labels).

    # check non-interval variable for number of unique values
    for col in non_interval_cols:
        if n_unique[col] > 1000:
            warnings.warn(
                "The number of unique values of variable {0!s} is large: {1:d}. Are you sure this is "
                "not an interval variable? Analysis for pairs of variables including {0!s} can be slow.".format(
                    col, n_unique[col]
                )
            )

    drop_cols = []

    # check for interval values whether there are at least two unique values (otherwise I cannot bin automatically)
    for col in interval_cols:
        if n_unique[col] < 2:
            drop_cols.append(col)
            warnings.warn(
                "Not enough unique value for variable {0!s} for analysis {1:d}. Dropping this column".format(
                    col, n_unique[col]
                )
            )

    # check non-interval values whether there are at least two different values OR 1 value and NaN if dropna==False
    for col in non_interval_cols:
        if n_unique[col] == 0 or (n_unique[col] == 1 and dropna):
            drop_cols.append(col)
            warnings.warn(
                "Not enough unique value for variable {0!s} for analysis {1:d}. Dropping this column".format(
                    col, n_unique[col]
                )
            )

    df_clean = df.copy()
    interval_cols_clean = copy.copy(interval_cols)
    if len(drop_cols) > 0:
        # preserves column order: https://github.com/KaveIO/PhiK/issues/1
        df_clean.drop(columns=drop_cols, inplace=True)
        interval_cols_clean = [col for col in interval_cols if col not in drop_cols]

    return df_clean, interval_cols_clean
def dq_check_hist2d(hist2d: np.ndarray) -> bool:
    """Basic data quality checks for a contingency table

    The following checks are done:

    1. There must be at least two bins in both the x and y direction. If not,
       a warning is issued and the check fails.

    2. If the number of bins in the x and/or y direction is larger than 1000 a warning is issued.
       This does not make the check fail.

    In the warnings, x refers to the first axis of the table and y to the second.

    :param hist2d: contingency table
    :return: bool passed_check
    """
    n_x, n_y = hist2d.shape[0], hist2d.shape[1]

    if 0 in hist2d.shape or 1 in hist2d.shape:
        warnings.warn(
            "Too few unique values for variable x ({0:d}) or y ({1:d})".format(
                n_x, n_y
            )
        )
        return False

    if n_x > 1000:
        warnings.warn(
            "The number of unique values of variable x is large: {0:d}. "
            "Are you sure this is not an interval variable? Analysis might be slow.".format(
                n_x
            )
        )
    if n_y > 1000:
        # report the size of the y axis here (the x axis size was reported by mistake)
        warnings.warn(
            "The number of unique values of variable y is large: {0:d}. "
            "Are you sure this is not an interval variable? Analysis might be slow.".format(
                n_y
            )
        )

    return True
================================================
FILE: phik/decorators/__init__.py
================================================
# flake8: noqa
# import pandas DataFrame decorators
from phik.decorators import pandas
================================================
FILE: phik/decorators/pandas.py
================================================
"""Project: PhiK - correlation analyzer library
Module: phik.decorators.pandas
Created: 2018/11/14
Description:
Decorators for pandas DataFrame objects
Authors:
KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands
Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""
from pandas import DataFrame, Series

from phik.binning import hist2d, hist2d_from_array
from phik.phik import phik_matrix, global_phik_array
from phik.significance import significance_matrix
from phik.outliers import (
    outlier_significance_matrices,
    outlier_significance_matrix,
    outlier_significance_from_array,
)

# Importing this module attaches the phik functions below as methods to the
# pandas DataFrame and Series classes.

# -- DataFrame methods --
# 2d histogram
DataFrame.hist2d = hist2d
# phik correlation matrix and global phik
DataFrame.phik_matrix = phik_matrix
DataFrame.global_phik = global_phik_array
# significance matrix for variable dependencies
DataFrame.significance_matrix = significance_matrix
# outlier significance matrices
DataFrame.outlier_significance_matrices = outlier_significance_matrices
DataFrame.outlier_significance_matrix = outlier_significance_matrix

# -- Series methods --
# 2d histogram
Series.hist2d = hist2d_from_array
# outlier significance matrix
Series.outlier_significance_matrix = outlier_significance_from_array
================================================
FILE: phik/definitions.py
================================================
"""Project: PhiK - correlation analyzer library
Created: 2018/09/05
Description:
Definitions used throughout the phik package
Authors:
KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands
Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""
# string that takes the place of np.nan
NaN = "NaN"

# labels given to the underflow (UF) and overflow (OF) bin when assigning bin indices
UF = "UF"
OF = "OF"
================================================
FILE: phik/entry_points.py
================================================
"""Project: PhiK - correlation analyzer library
Created: 2018/11/13
Description:
Collection of phik entry points
Authors:
KPMG Advanced Analytics & Big Data team, Amstelveen, The Netherlands
Redistribution and use in source and binary forms, with or without
modification, are permitted according to the terms listed in the file
LICENSE.
"""
def phik_trial():
    """Run Phi_K tests.

    The command-line arguments of the current process are handed to pytest,
    followed by the default option ``-rxs``. The process then exits with the
    value returned by ``pytest.main``.

    Kept until the switch to pytest or nose and tox is complete. The hard-coded
    options are a known weakness that still needs a better solution.
    """
    import sys

    import pytest

    # ['--pylint'] +
    # -r xs shows extra info on skips and xfails.
    pytest_args = [*sys.argv[1:], "-rxs"]
    exit_code = pytest.main(pytest_args)
    sys.exit(exit_code)
================================================
FILE: phik/notebooks/phik_tutorial_advanced.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Phi_K advanced tutorial\n",
"\n",
"This notebook guides you through the more advanced functionality of the phik package. This notebook will not cover all the underlying theory, but will just attempt to give an overview of all the options that are available. For a theoretical description the user is referred to our paper.\n",
"\n",
"The package offers functionality on three related topics:\n",
"\n",
"1. Phik correlation matrix\n",
"2. Significance matrix\n",
"3. Outlier significance matrix"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# install phik (if not installed yet)\n",
"import sys\n",
"\n",
"!\"{sys.executable}\" -m pip install phik"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# import standard packages\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import itertools\n",
"\n",
"import phik\n",
"\n",
"from phik import resources\n",
"from phik.binning import bin_data\n",
"from phik.decorators import *\n",
"from phik.report import plot_correlation_matrix\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# if one changes something in the phik-package one can automatically reload the package or module\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data\n",
"\n",
"A simulated dataset is part of the phik-package. The dataset concerns car insurance data. Load the dataset here:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>black</td>\n",
" <td>26.377219</td>\n",
" <td>suburbs</td>\n",
" <td>156806.288398</td>\n",
" <td>XXL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>black</td>\n",
" <td>58.976840</td>\n",
" <td>suburbs</td>\n",
" <td>74400.323559</td>\n",
" <td>XL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>multicolor</td>\n",
" <td>55.744988</td>\n",
" <td>downtown</td>\n",
" <td>267856.748015</td>\n",
" <td>XXL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>metalic</td>\n",
" <td>57.629139</td>\n",
" <td>downtown</td>\n",
" <td>259028.249060</td>\n",
" <td>XXL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>green</td>\n",
" <td>21.490637</td>\n",
" <td>downtown</td>\n",
" <td>110712.216080</td>\n",
" <td>XL</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"0 black 26.377219 suburbs 156806.288398 XXL\n",
"1 black 58.976840 suburbs 74400.323559 XL\n",
"2 multicolor 55.744988 downtown 267856.748015 XXL\n",
"3 metalic 57.629139 downtown 259028.249060 XXL\n",
"4 green 21.490637 downtown 110712.216080 XL"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify bin types\n",
"\n",
"The phik-package offers a way to calculate correlations between variables of mixed types. Variable types can be inferred automatically, although we recommend that the user specifies the variable types explicitly. \n",
"\n",
"Because interval type variables need to be binned in order to calculate phik and the significance, a list of interval variables is created."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['driver_age', 'mileage']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_types = {'severity': 'interval',\n",
" 'driver_age':'interval',\n",
" 'satisfaction':'ordinal',\n",
" 'mileage':'interval',\n",
" 'car_size':'ordinal',\n",
" 'car_use':'ordinal',\n",
" 'car_color':'categorical',\n",
" 'area':'categorical'}\n",
"\n",
"interval_cols = [col for col, v in data_types.items() if v=='interval' and col in data.columns]\n",
"interval_cols\n",
"# interval_cols is used below"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Phik correlation matrix\n",
"\n",
"Now let's start calculating the correlation phik between pairs of variables. \n",
"\n",
"Note that the original dataset is used as input, the binning of interval variables is done automatically."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>car_color</th>\n",
" <td>1.000000</td>\n",
" <td>0.389671</td>\n",
" <td>0.590456</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>driver_age</th>\n",
" <td>0.389671</td>\n",
" <td>1.000000</td>\n",
" <td>0.105506</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>area</th>\n",
" <td>0.590456</td>\n",
" <td>0.105506</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mileage</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.768589</td>\n",
" </tr>\n",
" <tr>\n",
" <th>car_size</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.768589</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"car_color 1.000000 0.389671 0.590456 0.000000 0.000000\n",
"driver_age 0.389671 1.000000 0.105506 0.000000 0.000000\n",
"area 0.590456 0.105506 1.000000 0.000000 0.000000\n",
"mileage 0.000000 0.000000 0.000000 1.000000 0.768589\n",
"car_size 0.000000 0.000000 0.000000 0.768589 1.000000"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"phik_overview = data.phik_matrix(interval_cols=interval_cols)\n",
"phik_overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specify binning per interval variable\n",
"\n",
"Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. Note that the measured phik correlation is dependent on the chosen binning. \n",
"The default binning is uniform between the min and max values of the interval variable."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>car_color</th>\n",
" <td>1.000000</td>\n",
" <td>0.388350</td>\n",
" <td>0.590456</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>driver_age</th>\n",
" <td>0.388350</td>\n",
" <td>1.000000</td>\n",
" <td>0.071189</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>area</th>\n",
" <td>0.590456</td>\n",
" <td>0.071189</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mileage</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.665845</td>\n",
" </tr>\n",
" <tr>\n",
" <th>car_size</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.665845</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"car_color 1.000000 0.388350 0.590456 0.000000 0.000000\n",
"driver_age 0.388350 1.000000 0.071189 0.000000 0.000000\n",
"area 0.590456 0.071189 1.000000 0.000000 0.000000\n",
"mileage 0.000000 0.000000 0.000000 1.000000 0.665845\n",
"car_size 0.000000 0.000000 0.000000 0.665845 1.000000"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}\n",
"phik_overview = data.phik_matrix(interval_cols=interval_cols, bins=bins)\n",
"phik_overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Do not apply noise correction\n",
"\n",
"For low statistics samples often a correlation larger than zero is measured when no correlation is actually present in the true underlying distribution. This is not only the case for phik, but also for the pearson correlation and Cramer's phi (see figure 4 in <font color='red'> XX </font>). In the phik calculation a noise correction is applied by default, to take into account erroneous correlation values as a result of low statistics. To switch off this noise cancellation (not recommended), do:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>car_color</th>\n",
" <td>1.000000</td>\n",
" <td>0.407860</td>\n",
" <td>0.594172</td>\n",
" <td>0.136267</td>\n",
" <td>0.096629</td>\n",
" </tr>\n",
" <tr>\n",
" <th>driver_age</th>\n",
" <td>0.407860</td>\n",
" <td>1.000000</td>\n",
" <td>0.190390</td>\n",
" <td>0.199606</td>\n",
" <td>0.121585</td>\n",
" </tr>\n",
" <tr>\n",
" <th>area</th>\n",
" <td>0.594172</td>\n",
" <td>0.190390</td>\n",
" <td>1.000000</td>\n",
" <td>0.149679</td>\n",
" <td>0.067452</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mileage</th>\n",
" <td>0.136267</td>\n",
" <td>0.199606</td>\n",
" <td>0.149679</td>\n",
" <td>1.000000</td>\n",
" <td>0.770836</td>\n",
" </tr>\n",
" <tr>\n",
" <th>car_size</th>\n",
" <td>0.096629</td>\n",
" <td>0.121585</td>\n",
" <td>0.067452</td>\n",
" <td>0.770836</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"car_color 1.000000 0.407860 0.594172 0.136267 0.096629\n",
"driver_age 0.407860 1.000000 0.190390 0.199606 0.121585\n",
"area 0.594172 0.190390 1.000000 0.149679 0.067452\n",
"mileage 0.136267 0.199606 0.149679 1.000000 0.770836\n",
"car_size 0.096629 0.121585 0.067452 0.770836 1.000000"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"phik_overview = data.phik_matrix(interval_cols=interval_cols, noise_correction=False)\n",
"phik_overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using a different expectation histogram\n",
"\n",
"By default phik compares the 2d distribution of two (binned) variables with the distribution that assumes no dependency between them. One can also change the expected distribution though. Phi_K is calculated in the same way, but using the other expectation distribution. "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from phik.binning import auto_bin_data\n",
"from phik.phik import phik_observed_vs_expected_from_rebinned_df, phik_from_hist2d\n",
"from phik.statistics import get_dependent_frequency_estimates"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# get observed 2d histogram of two variables\n",
"cols = [\"mileage\", \"car_size\"]\n",
"icols = [\"mileage\"]\n",
"observed = data[cols].hist2d(interval_cols=icols).values"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.768588829489185\n"
]
}
],
"source": [
"# default phik evaluation from observed distribution\n",
"phik_value = phik_from_hist2d(observed)\n",
"print (phik_value)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.768588829489185\n"
]
}
],
"source": [
"# phik evaluation from an observed and expected distribution\n",
"expected = get_dependent_frequency_estimates(observed)\n",
"phik_value = phik_from_hist2d(observed=observed, expected=expected)\n",
"print (phik_value)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# one can also compare two datasets against each other, and get a full phik matrix that way.\n",
"# this needs binned datasets though. \n",
"# (the user needs to make sure the binnings of both datasets are identical.) \n",
"data_binned, _ = auto_bin_data(data, interval_cols=interval_cols)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# here we are comparing data_binned against itself\n",
"phik_matrix = phik_observed_vs_expected_from_rebinned_df(data_binned, data_binned)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>car_color</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>driver_age</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>area</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mileage</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>car_size</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"car_color 1.0 0.0 0.0 0.0 0.0\n",
"driver_age 0.0 1.0 0.0 0.0 0.0\n",
"area 0.0 0.0 1.0 0.0 0.0\n",
"mileage 0.0 0.0 0.0 1.0 0.0\n",
"car_size 0.0 0.0 0.0 0.0 1.0"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# all off-diagonal entries are zero, meaning that all 2d distributions of both datasets are identical.\n",
"# (by construction the diagonal is one.)\n",
"phik_matrix"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Statistical significance of the correlation\n",
"\n",
"When assessing correlations it is good practice to evaluate both the correlation and the significance of the correlation: a large correlation may be statistically insignificant, and vice versa a small correlation may be very significant. For instance, scipy.stats.pearsonr returns both the pearson correlation and the p-value. Similarly, the phik package offers functionality to calculate a significance matrix. Significance is defined as:\n",
"\n",
"$$Z = \\Phi^{-1}(1-p)\\ ;\\quad \\Phi(z)=\\frac{1}{\\sqrt{2\\pi}} \\int_{-\\infty}^{z} e^{-t^{2}/2}\\,{\\rm d}t $$\n",
"\n",
"Several corrections to the 'standard' p-value calculation are taken into account, making the method more robust for low statistics and sparse data cases. The user is referred to our paper for more details.\n",
"\n",
"Due to the corrections, the significance calculation can take a few seconds."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>car_color</th>\n",
" <td>85.498655</td>\n",
" <td>19.836720</td>\n",
" <td>37.623764</td>\n",
" <td>-0.559532</td>\n",
" <td>-0.483387</td>\n",
" </tr>\n",
" <tr>\n",
" <th>driver_age</th>\n",
" <td>19.836720</td>\n",
" <td>84.370542</td>\n",
" <td>1.852524</td>\n",
" <td>-0.572284</td>\n",
" <td>-0.459980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>area</th>\n",
" <td>37.623764</td>\n",
" <td>1.852524</td>\n",
" <td>72.415600</td>\n",
" <td>-0.560672</td>\n",
" <td>-0.273138</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mileage</th>\n",
" <td>-0.559532</td>\n",
" <td>-0.572284</td>\n",
" <td>-0.560672</td>\n",
" <td>91.262677</td>\n",
" <td>49.285368</td>\n",
" </tr>\n",
" <tr>\n",
" <th>car_size</th>\n",
" <td>-0.483387</td>\n",
" <td>-0.459980</td>\n",
" <td>-0.273138</td>\n",
" <td>49.285368</td>\n",
" <td>69.064056</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"car_color 85.498655 19.836720 37.623764 -0.559532 -0.483387\n",
"driver_age 19.836720 84.370542 1.852524 -0.572284 -0.459980\n",
"area 37.623764 1.852524 72.415600 -0.560672 -0.273138\n",
"mileage -0.559532 -0.572284 -0.560672 91.262677 49.285368\n",
"car_size -0.483387 -0.459980 -0.273138 49.285368 69.064056"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"significance_overview = data.significance_matrix(interval_cols=interval_cols)\n",
"significance_overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specify binning per interval variable\n",
"Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. Note that the measure phik correlation is dependent on the chosen binning."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>car_color</th>\n",
" <td>85.480870</td>\n",
" <td>20.544400</td>\n",
" <td>37.613135</td>\n",
" <td>-0.214896</td>\n",
" <td>-0.447747</td>\n",
" </tr>\n",
" <tr>\n",
" <th>driver_age</th>\n",
" <td>20.544400</td>\n",
" <td>83.344168</td>\n",
" <td>2.478032</td>\n",
" <td>-0.563892</td>\n",
" <td>-0.534263</td>\n",
" </tr>\n",
" <tr>\n",
" <th>area</th>\n",
" <td>37.613135</td>\n",
" <td>2.478032</td>\n",
" <td>72.428355</td>\n",
" <td>-0.309349</td>\n",
" <td>-0.260994</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mileage</th>\n",
" <td>-0.214896</td>\n",
" <td>-0.563892</td>\n",
" <td>-0.309349</td>\n",
" <td>77.784086</td>\n",
" <td>47.010736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>car_size</th>\n",
" <td>-0.447747</td>\n",
" <td>-0.534263</td>\n",
" <td>-0.260994</td>\n",
" <td>47.010736</td>\n",
" <td>69.081712</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"car_color 85.480870 20.544400 37.613135 -0.214896 -0.447747\n",
"driver_age 20.544400 83.344168 2.478032 -0.563892 -0.534263\n",
"area 37.613135 2.478032 72.428355 -0.309349 -0.260994\n",
"mileage -0.214896 -0.563892 -0.309349 77.784086 47.010736\n",
"car_size -0.447747 -0.534263 -0.260994 47.010736 69.081712"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = {'mileage':5, 'driver_age':[18,25,35,45,55,65,125]}\n",
"significance_overview = data.significance_matrix(interval_cols=interval_cols, bins=bins)\n",
"significance_overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specify significance method\n",
"\n",
"The recommended method to calculate the significance of the correlation is a hybrid approach, which uses the G-test statistic. The number of degrees of freedom and an analytical, empirical description of the $\\chi^2$ distribution are sed, based on Monte Carlo simulations. This method works well for both high as low statistics samples.\n",
"\n",
"Other approaches to calculate the significance are implemented:\n",
"- asymptotic: fast, but over-estimates the number of degrees of freedom for low statistics samples, leading to erroneous values of the significance\n",
"- MC: Many simulated samples are needed to accurately measure significances larger than 3, making this method computationally expensive.\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>car_color</th>\n",
" <td>85.526574</td>\n",
" <td>19.681564</td>\n",
" <td>37.661844</td>\n",
" <td>-0.385023</td>\n",
" <td>-0.333340</td>\n",
" </tr>\n",
" <tr>\n",
" <th>driver_age</th>\n",
" <td>19.681564</td>\n",
" <td>84.014654</td>\n",
" <td>1.742050</td>\n",
" <td>-0.947153</td>\n",
" <td>-0.793434</td>\n",
" </tr>\n",
" <tr>\n",
" <th>area</th>\n",
" <td>37.661844</td>\n",
" <td>1.742050</td>\n",
" <td>72.440209</td>\n",
" <td>-0.465002</td>\n",
" <td>-0.123678</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mileage</th>\n",
" <td>-0.385023</td>\n",
" <td>-0.947153</td>\n",
" <td>-0.465002</td>\n",
" <td>91.301129</td>\n",
" <td>49.332305</td>\n",
" </tr>\n",
" <tr>\n",
" <th>car_size</th>\n",
" <td>-0.333340</td>\n",
" <td>-0.793434</td>\n",
" <td>-0.123678</td>\n",
" <td>49.332305</td>\n",
" <td>69.107448</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"car_color 85.526574 19.681564 37.661844 -0.385023 -0.333340\n",
"driver_age 19.681564 84.014654 1.742050 -0.947153 -0.793434\n",
"area 37.661844 1.742050 72.440209 -0.465002 -0.123678\n",
"mileage -0.385023 -0.947153 -0.465002 91.301129 49.332305\n",
"car_size -0.333340 -0.793434 -0.123678 49.332305 69.107448"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"significance_overview = data.significance_matrix(interval_cols=interval_cols, significance_method='asymptotic')\n",
"significance_overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Simulation method\n",
"\n",
"The chi2 of a contingency table is measured using a comparison of the expected frequencies with the true frequencies in a contingency table. The expected frequencies can be simulated in a variety of ways. The following methods are implemented:\n",
"\n",
" - multinominal: Only the total number of records is fixed. (default)\n",
" - row_product_multinominal: The row totals fixed in the sampling.\n",
" - col_product_multinominal: The column totals fixed in the sampling.\n",
" - hypergeometric: Both the row or column totals are fixed in the sampling. (Note that this type of sampling is only available when row and column totals are integers, which is usually the case.)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# --- Warning, can be slow\n",
"# turned off here by default for unit testing purposes\n",
"\n",
"#significance_overview = data.significance_matrix(interval_cols=interval_cols, simulation_method='hypergeometric')\n",
"#significance_overview"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Expected frequencies"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from phik.simulation import sim_2d_data_patefield, sim_2d_product_multinominal, sim_2d_data"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>area</th>\n",
" <th>country_side</th>\n",
" <th>downtown</th>\n",
" <th>hills</th>\n",
" <th>suburbs</th>\n",
" <th>unpaved_roads</th>\n",
" </tr>\n",
" <tr>\n",
" <th>driver_age</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11.0</td>\n",
" <td>86.0</td>\n",
" <td>123.0</td>\n",
" <td>147.0</td>\n",
" <td>21.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>9.0</td>\n",
" <td>77.0</td>\n",
" <td>137.0</td>\n",
" <td>125.0</td>\n",
" <td>31.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7.0</td>\n",
" <td>102.0</td>\n",
" <td>131.0</td>\n",
" <td>130.0</td>\n",
" <td>18.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>17.0</td>\n",
" <td>83.0</td>\n",
" <td>130.0</td>\n",
" <td>95.0</td>\n",
" <td>14.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>13.0</td>\n",
" <td>68.0</td>\n",
" <td>120.0</td>\n",
" <td>72.0</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7.0</td>\n",
" <td>30.0</td>\n",
" <td>51.0</td>\n",
" <td>47.0</td>\n",
" <td>9.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1.0</td>\n",
" <td>11.0</td>\n",
" <td>23.0</td>\n",
" <td>14.0</td>\n",
" <td>7.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>7.0</td>\n",
" <td>8.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"area country_side downtown hills suburbs unpaved_roads\n",
"driver_age \n",
"1 11.0 86.0 123.0 147.0 21.0\n",
"2 9.0 77.0 137.0 125.0 31.0\n",
"3 7.0 102.0 131.0 130.0 18.0\n",
"4 17.0 83.0 130.0 95.0 14.0\n",
"5 13.0 68.0 120.0 72.0 8.0\n",
"6 7.0 30.0 51.0 47.0 9.0\n",
"7 1.0 11.0 23.0 14.0 7.0\n",
"8 0.0 4.0 7.0 8.0 2.0\n",
"9 0.0 0.0 1.0 1.0 0.0\n",
"10 0.0 1.0 1.0 0.0 0.0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inputdata = data[['driver_age', 'area']].hist2d(interval_cols=['driver_age'])\n",
"inputdata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Multinominal"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data total: 2000.0\n",
"sim total: 2000\n",
"data row totals: [ 65. 462. 724. 639. 110.]\n",
"sim row totals: [ 75 468 748 586 123]\n",
"data column totals: [388. 379. 388. 339. 281. 144. 56. 21. 2. 2.]\n",
"sim column totals: [378 380 375 335 281 164 59 25 1 2]\n"
]
}
],
"source": [
"simdata = sim_2d_data(inputdata.values)\n",
"print('data total:', inputdata.sum().sum())\n",
"print('sim total:', simdata.sum().sum())\n",
"print('data row totals:', inputdata.sum(axis=0).values)\n",
"print('sim row totals:', simdata.sum(axis=0))\n",
"print('data column totals:', inputdata.sum(axis=1).values)\n",
"print('sim column totals:', simdata.sum(axis=1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### product multinominal"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data total: 2000.0\n",
"sim total: 2000\n",
"data row totals: [ 65 462 724 639 110]\n",
"sim row totals: [ 65 462 724 639 110]\n",
"data column totals: [388 379 388 339 281 144 56 21 2 2]\n",
"sim column totals: [399 353 415 349 272 139 45 22 4 2]\n"
]
}
],
"source": [
"simdata = sim_2d_product_multinominal(inputdata.values, axis=0)\n",
"print('data total:', inputdata.sum().sum())\n",
"print('sim total:', simdata.sum().sum())\n",
"print('data row totals:', inputdata.sum(axis=0).astype(int).values)\n",
"print('sim row totals:', simdata.sum(axis=0).astype(int))\n",
"print('data column totals:', inputdata.sum(axis=1).astype(int).values)\n",
"print('sim column totals:', simdata.sum(axis=1).astype(int))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### hypergeometric (\"patefield\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data total: 2000.0\n",
"sim total: 2000\n",
"data row totals: [ 65 462 724 639 110]\n",
"sim row totals: [ 65 462 724 639 110]\n",
"data column totals: [388 379 388 339 281 144 56 21 2 2]\n",
"sim column totals: [388 379 388 339 281 144 56 21 2 2]\n"
]
}
],
"source": [
"# patefield simulation needs compiled c++ code.\n",
"# only run this if the python binding to the (compiled) patefiled simulation function is found.\n",
"try:\n",
" from phik.simcore import _sim_2d_data_patefield\n",
" CPP_SUPPORT = True\n",
"except ImportError:\n",
" CPP_SUPPORT = False\n",
"\n",
"if CPP_SUPPORT:\n",
" simdata = sim_2d_data_patefield(inputdata.values)\n",
" print('data total:', inputdata.sum().sum())\n",
" print('sim total:', simdata.sum().sum())\n",
" print('data row totals:', inputdata.sum(axis=0).astype(int).values)\n",
" print('sim row totals:', simdata.sum(axis=0))\n",
" print('data column totals:', inputdata.sum(axis=1).astype(int).values)\n",
" print('sim column totals:', simdata.sum(axis=1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Outlier significance\n",
"\n",
"The normal pearson correlation between two interval variables is easy to interpret. However, the phik correlation between two variables of mixed type is not always easy to interpret, especially when it concerns categorical variables. Therefore, functionality is provided to detect \"outliers\": excesses and deficits over the expected frequencies in the contingency table of two variables. \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example 1: mileage versus car_size"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For the categorical variable pair mileage - car_size we measured:\n",
"\n",
"$$\\phi_k = 0.77 \\, ,\\quad\\quad \\mathrm{significance} = 46.3$$\n",
"\n",
"Let's use the outlier significance functionality to gain a better understanding of this significance correlation between mileage and car size.\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"c0 = 'mileage'\n",
"c1 = 'car_size'\n",
"\n",
"tmp_interval_cols = ['mileage']"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>car_size</th>\n",
" <th>L</th>\n",
" <th>M</th>\n",
" <th>S</th>\n",
" <th>XL</th>\n",
" <th>XS</th>\n",
" <th>XXL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>53.5_30047.0</th>\n",
" <td>6.882155</td>\n",
" <td>21.483476</td>\n",
" <td>18.076204</td>\n",
" <td>-8.209536</td>\n",
" <td>10.820863</td>\n",
" <td>-22.423985</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30047.0_60040.5</th>\n",
" <td>20.034528</td>\n",
" <td>-0.251737</td>\n",
" <td>-3.408409</td>\n",
" <td>2.534277</td>\n",
" <td>-1.973628</td>\n",
" <td>-8.209536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60040.5_90033.9</th>\n",
" <td>1.627610</td>\n",
" <td>-3.043497</td>\n",
" <td>-2.265809</td>\n",
" <td>10.215936</td>\n",
" <td>-1.246784</td>\n",
" <td>-8.209536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90033.9_120027.4</th>\n",
" <td>-3.711579</td>\n",
" <td>-3.827278</td>\n",
" <td>-2.885475</td>\n",
" <td>12.999048</td>\n",
" <td>-1.638288</td>\n",
" <td>-7.185622</td>\n",
" </tr>\n",
" <tr>\n",
" <th>120027.4_150020.9</th>\n",
" <td>-7.665861</td>\n",
" <td>-6.173001</td>\n",
" <td>-4.746762</td>\n",
" <td>9.629145</td>\n",
" <td>-2.841508</td>\n",
" <td>-0.504521</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150020.9_180014.4</th>\n",
" <td>-7.533189</td>\n",
" <td>-6.063786</td>\n",
" <td>-4.660049</td>\n",
" <td>1.559370</td>\n",
" <td>-2.785049</td>\n",
" <td>6.765549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>180014.4_210007.8</th>\n",
" <td>-5.541940</td>\n",
" <td>-4.425929</td>\n",
" <td>-3.360023</td>\n",
" <td>-4.802787</td>\n",
" <td>-1.942469</td>\n",
" <td>10.520540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210007.8_240001.3</th>\n",
" <td>-3.496905</td>\n",
" <td>-2.745103</td>\n",
" <td>-2.030802</td>\n",
" <td>-5.850529</td>\n",
" <td>-1.100873</td>\n",
" <td>8.723925</td>\n",
" </tr>\n",
" <tr>\n",
" <th>240001.3_269994.8</th>\n",
" <td>-5.275976</td>\n",
" <td>-4.207164</td>\n",
" <td>-3.186534</td>\n",
" <td>-8.616464</td>\n",
" <td>-1.830944</td>\n",
" <td>13.303101</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269994.8_299988.2</th>\n",
" <td>-8.014016</td>\n",
" <td>-6.458253</td>\n",
" <td>-4.973240</td>\n",
" <td>-12.868389</td>\n",
" <td>-2.989055</td>\n",
" <td>20.992824</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"car_size L M S XL XS \\\n",
"53.5_30047.0 6.882155 21.483476 18.076204 -8.209536 10.820863 \n",
"30047.0_60040.5 20.034528 -0.251737 -3.408409 2.534277 -1.973628 \n",
"60040.5_90033.9 1.627610 -3.043497 -2.265809 10.215936 -1.246784 \n",
"90033.9_120027.4 -3.711579 -3.827278 -2.885475 12.999048 -1.638288 \n",
"120027.4_150020.9 -7.665861 -6.173001 -4.746762 9.629145 -2.841508 \n",
"150020.9_180014.4 -7.533189 -6.063786 -4.660049 1.559370 -2.785049 \n",
"180014.4_210007.8 -5.541940 -4.425929 -3.360023 -4.802787 -1.942469 \n",
"210007.8_240001.3 -3.496905 -2.745103 -2.030802 -5.850529 -1.100873 \n",
"240001.3_269994.8 -5.275976 -4.207164 -3.186534 -8.616464 -1.830944 \n",
"269994.8_299988.2 -8.014016 -6.458253 -4.973240 -12.868389 -2.989055 \n",
"\n",
"car_size XXL \n",
"53.5_30047.0 -22.423985 \n",
"30047.0_60040.5 -8.209536 \n",
"60040.5_90033.9 -8.209536 \n",
"90033.9_120027.4 -7.185622 \n",
"120027.4_150020.9 -0.504521 \n",
"150020.9_180014.4 6.765549 \n",
"180014.4_210007.8 10.520540 \n",
"210007.8_240001.3 8.723925 \n",
"240001.3_269994.8 13.303101 \n",
"269994.8_299988.2 20.992824 "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outlier_signifs, binning_dict = data[[c0,c1]].outlier_significance_matrix(interval_cols=tmp_interval_cols, \n",
" retbins=True)\n",
"outlier_signifs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specify binning per interval variable\n",
"Binning can be set per interval variable individually. One can set the number of bins, or specify a list of bin edges. \n",
"\n",
"Note: in case a bin is created without any records this bin will be automatically dropped in the phik and (outlier) significance calculations. However, in the outlier significance calculation this will currently lead to an error as the number of provided bin edges does not match the number of bins anymore."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>car_size</th>\n",
" <th>L</th>\n",
" <th>M</th>\n",
" <th>S</th>\n",
" <th>XL</th>\n",
" <th>XS</th>\n",
" <th>XXL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0.0_100.0</th>\n",
" <td>-0.223635</td>\n",
" <td>-0.153005</td>\n",
" <td>-0.096640</td>\n",
" <td>-0.504167</td>\n",
" <td>2.150837</td>\n",
" <td>-1.337308</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100.0_1000.0</th>\n",
" <td>-0.742899</td>\n",
" <td>-0.533211</td>\n",
" <td>2.164954</td>\n",
" <td>-1.469996</td>\n",
" <td>5.704340</td>\n",
" <td>-3.272689</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1000.0_10000.0</th>\n",
" <td>-3.489668</td>\n",
" <td>3.499856</td>\n",
" <td>18.061724</td>\n",
" <td>-6.831062</td>\n",
" <td>11.617394</td>\n",
" <td>-13.063085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000.0_100000.0</th>\n",
" <td>25.086723</td>\n",
" <td>15.956527</td>\n",
" <td>-0.251877</td>\n",
" <td>5.162309</td>\n",
" <td>-3.896807</td>\n",
" <td>-8.209536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100000.0_1000000.0</th>\n",
" <td>-8.209536</td>\n",
" <td>-17.223164</td>\n",
" <td>-13.626621</td>\n",
" <td>-2.140870</td>\n",
" <td>-8.688844</td>\n",
" <td>44.933133</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"car_size L M S XL XS \\\n",
"0.0_100.0 -0.223635 -0.153005 -0.096640 -0.504167 2.150837 \n",
"100.0_1000.0 -0.742899 -0.533211 2.164954 -1.469996 5.704340 \n",
"1000.0_10000.0 -3.489668 3.499856 18.061724 -6.831062 11.617394 \n",
"10000.0_100000.0 25.086723 15.956527 -0.251877 5.162309 -3.896807 \n",
"100000.0_1000000.0 -8.209536 -17.223164 -13.626621 -2.140870 -8.688844 \n",
"\n",
"car_size XXL \n",
"0.0_100.0 -1.337308 \n",
"100.0_1000.0 -3.272689 \n",
"1000.0_10000.0 -13.063085 \n",
"10000.0_100000.0 -8.209536 \n",
"100000.0_1000000.0 44.933133 "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = [0,1E2, 1E3, 1E4, 1E5, 1E6]\n",
"outlier_signifs, binning_dict = data[[c0,c1]].outlier_significance_matrix(interval_cols=tmp_interval_cols, \n",
" bins=bins, retbins=True)\n",
"outlier_signifs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Specify binning per interval variable -- dealing with underflow and overflow\n",
"\n",
"When specifying custom bins as situation can occur when the minimal (maximum) value in the data is smaller (larger) than the minimum (maximum) bin edge. Data points outside the specified range will be collected in the underflow (UF) and overflow (OF) bins. One can choose how to deal with these under/overflow bins, by setting the drop_underflow and drop_overflow variables.\n",
"\n",
"Note that the drop_underflow and drop_overflow options are also available for the calculation of the phik matrix and the significance matrix."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>car_size</th>\n",
" <th>L</th>\n",
" <th>M</th>\n",
" <th>S</th>\n",
" <th>XL</th>\n",
" <th>XS</th>\n",
" <th>XXL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>100.0_1000.0</th>\n",
" <td>-0.742899</td>\n",
" <td>-0.533211</td>\n",
" <td>2.164954</td>\n",
" <td>-1.469996</td>\n",
" <td>5.704340</td>\n",
" <td>-3.272689</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1000.0_10000.0</th>\n",
" <td>-3.489668</td>\n",
" <td>3.499856</td>\n",
" <td>18.061724</td>\n",
" <td>-6.831062</td>\n",
" <td>11.617394</td>\n",
" <td>-13.063085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000.0_100000.0</th>\n",
" <td>25.086723</td>\n",
" <td>15.956527</td>\n",
" <td>-0.251877</td>\n",
" <td>5.162309</td>\n",
" <td>-3.896807</td>\n",
" <td>-8.209536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>OF</th>\n",
" <td>-8.209536</td>\n",
" <td>-17.223164</td>\n",
" <td>-13.626621</td>\n",
" <td>-2.140870</td>\n",
" <td>-8.688844</td>\n",
" <td>44.933133</td>\n",
" </tr>\n",
" <tr>\n",
" <th>UF</th>\n",
" <td>-0.223635</td>\n",
" <td>-0.153005</td>\n",
" <td>-0.096640</td>\n",
" <td>-0.504167</td>\n",
" <td>2.150837</td>\n",
" <td>-1.337308</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"car_size L M S XL XS \\\n",
"100.0_1000.0 -0.742899 -0.533211 2.164954 -1.469996 5.704340 \n",
"1000.0_10000.0 -3.489668 3.499856 18.061724 -6.831062 11.617394 \n",
"10000.0_100000.0 25.086723 15.956527 -0.251877 5.162309 -3.896807 \n",
"OF -8.209536 -17.223164 -13.626621 -2.140870 -8.688844 \n",
"UF -0.223635 -0.153005 -0.096640 -0.504167 2.150837 \n",
"\n",
"car_size XXL \n",
"100.0_1000.0 -3.272689 \n",
"1000.0_10000.0 -13.063085 \n",
"10000.0_100000.0 -8.209536 \n",
"OF 44.933133 \n",
"UF -1.337308 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = [1E2, 1E3, 1E4, 1E5]\n",
"outlier_signifs, binning_dict = data[[c0,c1]].outlier_significance_matrix(interval_cols=tmp_interval_cols, \n",
" bins=bins, retbins=True, \n",
" drop_underflow=False,\n",
" drop_overflow=False)\n",
"outlier_signifs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dealing with NaN's in the data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's add some missing values to our data"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"data.loc[np.random.choice(range(len(data)), size=10), 'car_size'] = np.nan\n",
"data.loc[np.random.choice(range(len(data)), size=10), 'mileage'] = np.nan"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sometimes there can be information in the missing values and in which case you might want to consider the NaN values as a separate category. This can be achieved by setting the dropna argument to False."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>car_size</th>\n",
" <th>L</th>\n",
" <th>M</th>\n",
" <th>NaN</th>\n",
" <th>S</th>\n",
" <th>XL</th>\n",
" <th>XS</th>\n",
" <th>XXL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>100.0_1000.0</th>\n",
" <td>-0.742899</td>\n",
" <td>-0.533211</td>\n",
" <td>-0.053620</td>\n",
" <td>2.185319</td>\n",
" <td>-1.467322</td>\n",
" <td>5.704340</td>\n",
" <td>-3.254118</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1000.0_10000.0</th>\n",
" <td>-3.489668</td>\n",
" <td>3.499856</td>\n",
" <td>1.632438</td>\n",
" <td>17.591610</td>\n",
" <td>-6.821511</td>\n",
" <td>11.617394</td>\n",
" <td>-13.000691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000.0_100000.0</th>\n",
" <td>24.909164</td>\n",
" <td>15.798682</td>\n",
" <td>-1.078812</td>\n",
" <td>-0.081242</td>\n",
" <td>4.943028</td>\n",
" <td>-3.875525</td>\n",
" <td>-8.209536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>NaN</th>\n",
" <td>0.132649</td>\n",
" <td>0.488424</td>\n",
" <td>-0.073439</td>\n",
" <td>-0.455333</td>\n",
" <td>-0.132365</td>\n",
" <td>-0.211155</td>\n",
" <td>-0.012896</td>\n",
" </tr>\n",
" <tr>\n",
" <th>OF</th>\n",
" <td>-8.209536</td>\n",
" <td>-17.158980</td>\n",
" <td>-0.283391</td>\n",
" <td>-13.396642</td>\n",
" <td>-1.909226</td>\n",
" <td>-8.651800</td>\n",
" <td>43.560131</td>\n",
" </tr>\n",
" <tr>\n",
" <th>UF</th>\n",
" <td>-0.223635</td>\n",
" <td>-0.153005</td>\n",
" <td>-0.013130</td>\n",
" <td>-0.094218</td>\n",
" <td>-0.503051</td>\n",
" <td>2.150837</td>\n",
" <td>-1.328194</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"car_size L M NaN S XL \\\n",
"100.0_1000.0 -0.742899 -0.533211 -0.053620 2.185319 -1.467322 \n",
"1000.0_10000.0 -3.489668 3.499856 1.632438 17.591610 -6.821511 \n",
"10000.0_100000.0 24.909164 15.798682 -1.078812 -0.081242 4.943028 \n",
"NaN 0.132649 0.488424 -0.073439 -0.455333 -0.132365 \n",
"OF -8.209536 -17.158980 -0.283391 -13.396642 -1.909226 \n",
"UF -0.223635 -0.153005 -0.013130 -0.094218 -0.503051 \n",
"\n",
"car_size XS XXL \n",
"100.0_1000.0 5.704340 -3.254118 \n",
"1000.0_10000.0 11.617394 -13.000691 \n",
"10000.0_100000.0 -3.875525 -8.209536 \n",
"NaN -0.211155 -0.012896 \n",
"OF -8.651800 43.560131 \n",
"UF 2.150837 -1.328194 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = [1E2, 1E3, 1E4, 1E5]\n",
"outlier_signifs, binning_dict = data[[c0,c1]].outlier_significance_matrix(interval_cols=tmp_interval_cols, \n",
" bins=bins, retbins=True, \n",
" drop_underflow=False,\n",
" drop_overflow=False,\n",
" dropna=False)\n",
"outlier_signifs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here OF and UF are the underflow and overflow bin of car_size, respectively.\n",
"\n",
"To just ignore records with missing values set dropna to True (default)."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>car_size</th>\n",
" <th>L</th>\n",
" <th>M</th>\n",
" <th>S</th>\n",
" <th>XL</th>\n",
" <th>XS</th>\n",
" <th>XXL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>100.0_1000.0</th>\n",
" <td>-0.745805</td>\n",
" <td>-0.534179</td>\n",
" <td>2.177522</td>\n",
" <td>-1.473602</td>\n",
" <td>5.695755</td>\n",
" <td>-3.268662</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1000.0_10000.0</th>\n",
" <td>-3.451793</td>\n",
" <td>3.559705</td>\n",
" <td>17.674546</td>\n",
" <td>-6.770807</td>\n",
" <td>11.651568</td>\n",
" <td>-12.916946</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000.0_100000.0</th>\n",
" <td>25.035896</td>\n",
" <td>15.868135</td>\n",
" <td>-0.121191</td>\n",
" <td>4.904070</td>\n",
" <td>-3.896177</td>\n",
" <td>-8.209536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>OF</th>\n",
" <td>-8.209536</td>\n",
" <td>-17.164792</td>\n",
" <td>-13.459625</td>\n",
" <td>-1.934622</td>\n",
" <td>-8.695547</td>\n",
" <td>44.449479</td>\n",
" </tr>\n",
" <tr>\n",
" <th>UF</th>\n",
" <td>-0.224643</td>\n",
" <td>-0.153312</td>\n",
" <td>-0.095154</td>\n",
" <td>-0.505661</td>\n",
" <td>2.146765</td>\n",
" <td>-1.335316</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"car_size L M S XL XS \\\n",
"100.0_1000.0 -0.745805 -0.534179 2.177522 -1.473602 5.695755 \n",
"1000.0_10000.0 -3.451793 3.559705 17.674546 -6.770807 11.651568 \n",
"10000.0_100000.0 25.035896 15.868135 -0.121191 4.904070 -3.896177 \n",
"OF -8.209536 -17.164792 -13.459625 -1.934622 -8.695547 \n",
"UF -0.224643 -0.153312 -0.095154 -0.505661 2.146765 \n",
"\n",
"car_size XXL \n",
"100.0_1000.0 -3.268662 \n",
"1000.0_10000.0 -12.916946 \n",
"10000.0_100000.0 -8.209536 \n",
"OF 44.449479 \n",
"UF -1.335316 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = [1E2, 1E3, 1E4, 1E5]\n",
"outlier_signifs, binning_dict = data[[c0,c1]].outlier_significance_matrix(interval_cols=tmp_interval_cols, \n",
" bins=bins, retbins=True, \n",
" drop_underflow=False,\n",
" drop_overflow=False,\n",
" dropna=True)\n",
"outlier_signifs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the dropna option is also available for the calculation of the phik matrix and the significance matrix."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: phik/notebooks/phik_tutorial_basic.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Phi_K basic tutorial\n",
"\n",
"This notebook guides you through the basic functionality of the phik package. The package offers functionality on three related topics:\n",
"\n",
"1. Phik correlation matrix\n",
"2. Significance matrix\n",
"3. Outlier significance matrix\n",
"\n",
"For more information on the underlying theory, the user is referred to our paper."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# install phik (if not installed yet)\n",
"import sys\n",
"\n",
"!\"{sys.executable}\" -m pip install phik"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import standard packages\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import itertools\n",
"\n",
"import phik\n",
"\n",
"from phik import resources\n",
"from phik.binning import bin_data\n",
"from phik.report import plot_correlation_matrix\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# if one changes something in the phik-package one can automatically reload the package or module\n",
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data\n",
"\n",
"A simulated dataset is part of the phik-package. The dataset concerns fake car insurance data. Load the dataset here:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv( resources.fixture('fake_insurance_data.csv.gz') )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Take a first look at the data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's use a simple data.head() to get an idea of what the data looks like and inspect the different types of variables."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car_color</th>\n",
" <th>driver_age</th>\n",
" <th>area</th>\n",
" <th>mileage</th>\n",
" <th>car_size</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>black</td>\n",
" <td>26.377219</td>\n",
" <td>suburbs</td>\n",
" <td>156806.288398</td>\n",
" <td>XXL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>black</td>\n",
" <td>58.976840</td>\n",
" <td>suburbs</td>\n",
" <td>74400.323559</td>\n",
" <td>XL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>multicolor</td>\n",
" <td>55.744988</td>\n",
" <td>downtown</td>\n",
" <td>267856.748015</td>\n",
" <td>XXL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>metalic</td>\n",
" <td>57.629139</td>\n",
" <td>downtown</td>\n",
" <td>259028.249060</td>\n",
" <td>XXL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>green</td>\n",
" <td>21.490637</td>\n",
" <td>downtown</td>\n",
" <td>110712.216080</td>\n",
" <td>XL</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" car_color driver_age area mileage car_size\n",
"0 black 26.377219 suburbs 156806.288398 XXL\n",
"1 black 58.976840 suburbs 74400.323559 XL\n",
"2 multicolor 55.744988 downtown 267856.748015 XXL\n",
"3 metalic 57.629139 downtown 259028.249060 XXL\n",
"4 green 21.490637 downtown 110712.216080 XL"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Specify bin types\n",
"\n",
"The phik-package offers a way to calculate correlations between variables of mixed types. Variable types can be inferred automatically, although we recommend that the user specifies them explicitly.\n",
"\n",
"Because interval type variables need to be binned in order to calculate phik and the significance, a list of interval variables is created."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['driver_age', 'mileage']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_types = {'severity': 'interval',\n",
" 'driver_age':'interval',\n",
" 'satisfaction':'ordinal',\n",
" 'mileage':'interval',\n",
" 'car_size':'ordinal',\n",
" 'car_use':'ordinal',\n",
" 'car_color':'categorical',\n",
" 'area':'categorical'}\n",
"\n",
"interval_cols = [col for col, v in data_types.items() if v=='interval' and col in data.columns]\n",
"interval_cols"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visually inspect pairwise correlations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Bin the interval variables\n",
"\n",
"To get a feeling for the data, let's bin the interval variables and create 2d histograms to inspect the correlations between variables. By binning the interval variables we can treat all variable types in the same way.\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# bin the interval variables\n",
"data_binned, binning_dict = bin_data(data, cols=interval_cols, retbins=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'driver_age': [(18.030700817879673, 28.34523864121504),\n",
" (28.34523864121504, 38.659776464550404),\n",
" (38.659776464550404, 48.974314287885775),\n",
" (48.974314287885775, 59.288852111221146),\n",
" (59.288852111221146, 69.60338993455652),\n",
" (69.60338993455652, 79.91792775789187),\n",
" (79.91792775789187, 90.23246558122725),\n",
" (90.23246558122725, 100.54700340456262),\n",
" (100.54700340456262, 110.86154122789799),\n",
" (110.86154122789799, 121.17607905123334)],\n",
" 'mileage': [(53.54305708442213, 30047.010932306846),\n",
" (30047.010932306846, 60040.478807529274),\n",
" (60040.478807529274, 90033.94668275169),\n",
" (90033.94668275169, 120027.41455797412),\n",
" (120027.41455797412, 150020.88243319656),\n",
" (150020.88243319656, 180014.35030841897),\n",
" (180014.35030841897, 210007.81818364142),\n",
" (210007.81818364142, 240001.28605886383),\n",
" (240001.28605886383, 269994.75393408624),\n",
" (269994.75393408624, 299988.2218093087)]}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"binning_dict"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABDAAAAR4CAYAAADwnyYXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nOzdd7xcVbn/8c83VKnCJSoKGMqldw5NAghCSCwgTZrSvCByEaQ3qdKrWBAD0pQmoBJLIAEkNJGcBEJCVwgCF/kFpQtS8vz+WPvA5DAnOZnZe/aeOd/365VXZvbMPGvtc+Y8s/cza6+liMDMzMzMzMzMrMoGld0BMzMzMzMzM7NZcQHDzMzMzMzMzCrPBQwzMzMzMzMzqzwXMMzMzMzMzMys8lzAMDMzMzMzM7PKcwHDzMzMzMzMzCrPBQyzEkm6XNIpZffDzGygcf41MyuH8681wwUMszYg6Q5J/1N2P8zMBhrnXzOzcjj/Wj0uYJiZmZmZmZlZ5bmAYdZCktaSNFHS65KuA+bNti8i6feSpkl6Obu9RPbYqcDGwI8lvSHpx9n2CyQ9K+k1SRMkbVzajpmZVZzzr5lZOZx/LU8uYJi1iKS5gd8CvwAWBa4Hts8eHgRcBnwWWAp4C/gxQEQcC9wFHBARC0TEAdlrxgNrZrGuBq6XNG9r9sbMrH04/5qZlcP51/LmAoZZ62wAzAX8ICLejYgbSEmYiPhnRNwYEf+OiNeBU4FNZxYsIn6Zve69iDgXmAdYoeB9MDNrR86/ZmblcP61XLmAYdY6nwaej4io2fYMgKT5JP1M0jOSXgPuBD4uaY6+gkk6VNKjkl6V9AqwMLBYkTtgZtamnH/NzMrh/Gu5cgHDrHVeAD4jSTXblsr+P5RUPV4/IhYCNsm29zy3NumTXe93JPA1YJGI+Djwas3zzczsQ86/ZmblcP61XLmAYdY6fwbeAw6UNKek7YD1sscWJF3394qkRYETer32RWCZmvsLZrGmAXNKOh5YqMjOm5m1MedfM7NyOP9arlzAMGuRiHgH2A7YE3gZ2An4dfbwD4CPAS8B9wE393r5BcAO2QzNPwRuAUYDT5CG4b0NPFvwLpiZtSXnXzOzcjj/Wt404+VIZmZmZmZmZmbV4xEYZmZmZmZmZlZ5LmCYmZmZmZmZWeW5gGFmZmZmZmZmlecChpmZmZmZmZlV3pxld8Aas9hii8WQIUPK7oaZ5WDChAkvRcTgsvth/eP8a9Y5nH/bi/OvWedoNP+6gNGmhgwZwiITly4k9tjp1wOw5aAdC4nfijZ64g+ba+dC4o9591oARqx0dCHxAUY/ejoAm3/hjELi337bUQAMX3SfQuID3Pyvi4Hif8/tGr+nDUnPFNaA5a7I/Fu0Vryni9azD8MX3rvknjTu5lcvBWDEkINL7knjRk89H4Bhc+9ack8aN+adq51/28yQIUMY/Nx6hcQe/Y8LARjxqf0Ljd+K4+ui/i7HvHM1AMNXO7aQ+AA3Tz41tbHYvsXEf2kkUPzPqBN+z0X9LUD6e2g0//oSEjMzMzMzMzOrPBcwzMzMzMzMzKzyXMAwMzMzMzMzs8pzAcPMzMzMzMzMKs8FDDMzMzMzMzOrPBcwzMzMzMzMzKzyXMDImaQ3+ti+n6Tds9t7Svp0a3tmZtbZnH/NzMrh/GtmrTJn2R0YKCLiopq7ewJTgP8rpzdmZgOH86+ZWTmcf80sby5gzCZJRwBvR8QPJZ0PrBERm0v6ArBX9pxTgS8DbwHbRMSLkk4E3gCmAl3AVZLeAjYEVgbOAxYAXgL2jIgXWrtnZmbV5vxrZlYO518zqwpfQjL77gQ2zm53AQtImgsYCtwFzA/cFxFrZM/dp/bFEXED0A3sFhFrAu8BPwJ2iIh1gEuBU+s1LGlfSd2SuqdNm5b/npmZVZvzr5lZOZx/zawSXMCYfROAdSQtCPwH+DMpkW9MSuDvAL+vee
6QWcRbAVgVGCvpQeB7wBL1nhgRIyOiKyK6Bg8e3Ox+mJm1G+dfM7NyOP+aWSX4EpLZFBHvSppKGi53L/AQsBmwLPAo8G5ERPb095n1z1jAwxGxYTE9NjPrDM6/ZmblcP41s6rwCIzG3Akclv1/F7Af8GBN4p6V14EFs9uPA4MlbQggaS5Jq+TcXzOzTuH8a2ZWDudfMyudCxiNuQtYHPhzRLwIvJ1t66/LgYuyIXNzADsAZ0qaBDwIfC7f7pqZdQznXzOzcjj/mlnpfAlJAyLiNmCumvvL19xeoOb2DcAN2e0Ta7bfCNxYE/JBYJPiemxm1hmcf83MyuH8a2ZV4BEYZmZmZmZmZlZ5LmCYmZmZmZmZWeW5gGFmZmZmZmZmlecChpmZmZmZmZlVnvq/8pFVSVdXV3R3d5fdDTPLgaQJEdFVdj+sf5x/zTqH8297cf416xyN5l+PwDAzMzMzMzOzyvMyqm1sy0E7FhJ37PTrC41f28ZW8+9eSPxb3rwSgE22PruQ+HeOOhyAlY85v5D4AI+cdjAAm37prELij/vDEQCMGLxfIfEBRk+7CGjf92or/xasvRT5nihSz/tt+EJ7ldyTxt382mUAbLnRKSX3pHFj7/keAMPXPL7knjTu5gdPBmDYXDuX3JPGjXn32rK7YA0YtsHJhcQdc1/6exy+2rGFxL958qkAjBhycCHxAUZPTcelI5Y4sJj4z/0wxV/p6ELiA4x+9HQAttj0tELi3zrumJbEH/Gp/QuJDzD6HxemNpY7vJj4f03nT0V+RvV8hjTCIzDMzMzMzMzMrPJcwDAzMzMzMzOzynMBw8zMzMzMzMwqzwUMMzMzMzMzM6s8FzDMzMzMzMzMrPJcwKgYSSdKOqzsfpiZDTTOv2Zm5XD+NbP+cgGjRZT4521m1mLOv2Zm5XD+NbO8OaEUSNIQSY9KuhCYCBwnabykhySdVPO8YyU9LulWYIXSOmxm1iGcf83MyuH8a2ZFmrPsDgwAKwB7Ab8FdgDWAwSMkrQJ8CawM7AW6fcxEZhQTlfNzDqK86+ZWTmcf82sEC5gFO+ZiLhP0jnAMOCBbPsCwH8DCwK/iYh/A0ga1VcgSfsC+wIstdRSLMLShXbczKzNOf+amZWjsPy76OKF9tvMKs6XkBTvzex/AadHxJrZv+Ui4ufZY9GfQBExMiK6IqJr8ODBhXTWzKyDOP+amZXD+dfMCuECRuvcAuwtaQEASZ+R9AngTmBbSR+TtCDwlTI7aWbWgZx/zczK4fxrZrnyJSQtEhFjJK0E/FkSwBvA1yNioqTrgAeBZ4C7SuymmVnHcf41MyuH86+Z5c0FjAJFxFRg1Zr7FwAX1HneqcCpreuZmVlnc/41MyuH86+ZFcmXkJiZmZmZmZlZ5bmAYWZmZmZmZmaV5wKGmZmZmZmZmVWeCxhmZmZmZmZmVnmK6NcSzFYxXV1d0d3dXXY3zCwHkiZERFfZ/bD+cf416xzOv+3F+desczSafz0Cw8zMzMzMzMwqz8uotrFhc+1cSNwx714LwIglDyokPsDoZ9NqWkO3P6eQ+HffeBgAK550fiHxHzvhYADWOLCY+ACTfpjaGL7KsYXEv/nhtHLZ8EX3KSQ+wM3/uhgo/r265aAdC4k/dvr1hcavbcPay1bz7lZ2Fxpyy9tXATBsvZNL7knjxtx/PAAb7XhuyT1p3D3XHwrA8MX2Lbknjbv5pZFAsfmxaM6/7WmTrc8uJO6dow4HYPhqBR13TU7HXSOWO7yQ+ACj/5p+NiNWOrqY+I+eDsCwDYr7DBlzX8rx6+59XiHxx196CABbdZ1YSPxbulPcEUscWEh8gNHP/TC1UfDvuRXnOY3wCAwzMzMzMzMzqzwXMMzMzMzMzMys8lzAMDMzMzMzM7PKcwHDzMzMzMzMzCrPBQwzMzMzMzMzqzwXMMzMzMzMzMys8lzAqEPSG9n/QyTtWrO9S9IPG4w5VdJiefXRzKwTOf+amZXD+dfM2oELGD
M3BPgggUdEd0QUt6hvRol/N2Y2kA3B+dfMrAxDcP41s4rqyCSRVY4fk3SJpCmSrpK0haR7JD0paT1JJ0o6rOY1UyQN6RXqDGBjSQ9KOljS5yX9Pnv+ApIukzRZ0kOSts+275JtmyLpzD76d0j2+BRJ363p86OSLgQmAkvm/5MxMyuW86+ZWTmcf81sIOjIAkZmOeACYHVgRVIleShwGHBMP2McBdwVEWtGxPm9HjsOeDUiVouI1YHbJX0aOBPYHFgTWFfSV2tfJGkdYC9gfWADYB9Ja2UPrwBcGRFrRcQzvTsjaV9J3ZK6p02b1s9dMDNrOedfM7NyOP+aWUfr5ALG0xExOSKmAw8Dt0VEAJNJQ+OatQXwk547EfEysC5wR0RMi4j3gKuATXq9bijwm4h4MyLeAH4NbJw99kxE3NdXgxExMiK6IqJr8ODBOeyCmVkhnH/NzMrh/GtmHa2TCxj/qbk9veb+dGBO4D1m3P95ZzO+gKizrT+v68ubs9kHM7Mqcv41MyuH86+ZdbROLmDMylRgbQBJawNL13nO68CCfbx+DHBAzx1JiwB/ATaVtJikOYBdgHG9Xncn8FVJ80maH9gWuKuJ/TAzazdTcf41MyvDVJx/zayNDeQCxo3AopIeBL4NPFHnOQ8B70maJOngXo+dAiySTUQ0CdgsIl4Ajgb+BEwCJkbETbUvioiJwOXA/aSEf0lEPJDjfpmZVZ3zr5lZOZx/zaytzVl2B4oQEVOBVWvu79nHY8P6eP0C2f/vAl/o9fAd2WNvAHvUee3VwNV1tg+puX0ecN7M+mxm1o6cf83MyuH8a2YDwUAegWFmZmZmZmZmbcIFDDMzMzMzMzOrPBcwzMzMzMzMzKzyXMAwMzMzMzMzs8pTRO+lnK0ddHV1RXd3d9ndMLMcSJoQEV1l98P6x/nXrHM4/7YX51+zztFo/q30CAxJgyR9rux+mJkNNM6/ZmblcP41M+tbpZdRjYjpks4FNiy7L1U0YqWjC4k7+tHTAdhox3MLiQ9wz/WHArDqEecXEn/KWWnZ8hVPKib+Yyek+OvsU0x8gAkXpza2HHpqIfHH3n0sAMMX3aeQ+AA3/+tiAIbNtXMh8ce8ey0AWw7asZD4Y6dfDxTXf/hwH6rG+XfmRgw5uOwuNGT01JSzVj28uNxVtClnZ7lxo1NK7knjxt7zPaC43NUKPfmxE/ahapx/Z67oY8ctNj2tkPi3jjsGgOGrHVtIfICbJ6djxmEbnFxI/DH3HQ/A53Yq7hzh3uvSOcJKxxXze370++n3vNlWZxYS/0+3HAnAiOUOLyQ+wOi/ng0U9znY8xm19n7FHStMvKjx46hKj8DIjJG0vSSV3REzswHG+dfMrBzOv2ZmdVR6BEbmEGB+4H1JbwECIiIWKrdbZmYdz/nXzKwczr9mZnVUvoAREQuW3Qczs4HI+dfMrBzOv2Zm9VW+gAEgaWtgk+zuHRHx+zL7Y2Y2UDj/mpmVw/nXzOyjKj8HhqQzgIOAR7J/B2XbzMysQM6/ZmblcP41M6uv8gUM4IvAlhFxaURcCgzPtlWKpDUlzbJfkj4v6ffZ7a0lHVV878zMGuL8a2ZWjsrnX+deMytDOxQwAD5ec3vh0noxc2symx8sETEqIlxNN7Mqc/41MytH1fOvc6+ZtVw7FDBOBx6QdLmkK4AJQCELNEsaIukxSZdImiLpKklbSLpH0pOS1pM0v6RLJY2X9ICkbSTNDZwM7CTpQUk7Zc+9N3vOvZJWqNPenpJ+nN3+pKTfSJqU/ftcEftoZjYbnH/NzMrRkvzr3Gtm7abyk3hGxDWS7gDWJS0hdWRE/KPAJpcDdgT2BcYDuwJDga2BY0jXId4eEXtL+jhwP3ArcDzQFREHAEhaCNgkIt6TtAXpQ2f7mbT7Q2BcRGwraQ5ggd5PkLRv1i+WWmopBs+Xx+6amdXn/Puhj+Rf5bG7Zmb1tTj/Vjb3ZnFnyL9ensVsYKtsAUPS2r02PZf9/2
lJn46IiQU1/XRETM768DBwW0SEpMnAEGAJYGtJh2XPnxdYqk6chYErJP03EMBcs2h3c2B3gIh4H3i19xMiYiQwEqCrqyt4czb3zMysH5x/+5F/X5rNPTMz64eS8m9lc2/22Az59+3Z2DEz6zyVLWAA587ksSAlvSL8p+b29Jr700k/r/eB7SPi8doXSVq/V5zvA3/KqspDgDuK6KyZWQGcf83MylFG/nXuNbO2UdkCRkRsVnYf+nAL8B1J38mq02tFxAPA6zDDqLaFgeez23v2I+5twLeBH2TD6OaPiNdy7LeZWb84/zr/mlk5Kpp/nXvNrDIqP4mnpLkkHSjphuzfAZJmNSStSN8nDYl7SNKU7D7An4CVeyYyAs4CTpd0DzBHP+IeBGyWDdebAKySf9fNzPrP+dfMrBwVy7/OvWZWGZUdgVHjp6SkeWF2/xvZtv/Ju6GImAqsWnN/zz4e+1ad1/6LNNFSreVrbh+XPe8OsiF1EXE5cHl2+0Vgm8Z7b2aWO+dfM7NytCT/OveaWbtphwLGuhGxRs392yVNKq03ZmYDh/OvmVk5nH/NzOqo/CUkwPuSlu25I2kZ0mRCZmZWLOdfM7NyOP+amdXRDiMwDgf+JOkp0jrYnwX2KrdLZmYDgvOvmVk5nH/NzOqofAEjIm7L1pNegZTAH4uI/8ziZWZm1iTnXzOzcjj/mpnVp4gouw8zJel/gasi4pXs/iLALhFx4cxf2dm6urqiu7u77G6YWQ4kTYiIrrL70Zvzb33Ov2adw/m3vTj/mnWORvNvO8yBsU9P8gaIiJeBfUrsj5nZQOH8a2ZWDudfM7M6Kn8JCTBIkiIbKiJpDmDukvtUCcudeX4hcf965MEArPD9YuIDPH5camPIz84pJP7Ubx0GwIonFbMPj52Q+r/aYcX9jCafk9oY8ZnvFBJ/9PM/SvGXPKiQ+ACjn70AgK3m3a2Q+Le8fRUAWw7asZD4Y6dfX2j82jYqyvm3DyOWPqTsLjRk9NPnAbD0j88tuSeNe/qAQwHYcJf23Yc/X5P2Ydjcu5bck8aNeedqoNj8WDTn3/a03NnnFRL3r4envL7u3sXEH39piv+FzU8vJD7AbbcfDcCWG51SSPyx93wPgCE/Leb4HWDqt7Nj+BMLOoY/MR1fb7L12YXEv3PU4QBs1XViIfEBbulOsTfetpjfw12/Sb+DIVeeUUh8gKm7H9Xwa9uhgHEL8CtJFwEB7AfcXG6XzMwGBOdfM7NyOP+amdXRDgWMI4F9gW+TJjEaA1xSao/MzAYG518zs3I4/5qZ1VH5AkZETAcuyv59hKQbI2L71vbKzKzzOf+amZXD+dfMrL52mMRzVpYpuwNmZgOU86+ZWTmcf81sQOqEAka114E1M+tczr9mZuVw/jWzAakTChhmZmZmZmZm1uE6oYChsjtgZjZAOf+amZXD+dfMBqRKFzAkzSHpl7N42pEt6Uw/SKr8pKhmZv3h/GtmVg7nXzOzvlU64UTE+5IGS5o7It7p4zljWtUfSccBuwHPAi8BE4AvA/cCGwGjJN0BnAcskD1nz4h4QdKywE+AwcC/gX0i4jFJlwOvAV3Ap4AjIuKGVu2TmVk9zr9mZuVw/jUz61ulCxiZqcA9kkYBb/ZsjIjzWtkJSV3A9sBapJ/bRFICB/h4RGwqaS5gHLBNREyTtBNwKrA3MBLYLyKelLQ+cCGwefb6xYGhwIrAKKBuApe0L2lNcJZaainmyn83zcxqTcX5t6cPM+TfwZUev2hmHWAqzr89ffDxr5l9oB0KGP+X/RsELFhiP4YCN0XEWwCSflfz2HXZ/ysAqwJjJQHMAbwgaQHgc8D12XaAeWpe/9tsve9HJH2yrw5ExEjSBwFdXV3xStO7ZGY2U86/md75l382vU9mZjPj/Jvx8a+Z1ap8ASMiTiq7D5mZTZb0Zs1zHo6IDWd4obQQ8EpErNnH6//Tz3bMzFrG+dfMrBzOv2Zm9VV+EGx2DeDZkv4o6f
aefyV05W7gK5LmzSrKX6rznMeBwZI2BJA0l6RVIuI14GlJO2bbJWmNlvXczKwBzr9mZuVw/jUzq6/yBQzgKuAxYGngJNI1geNb3YmIGE+6Pm8S8GugG3i113PeAXYAzpQ0CXiQNHQO0uRH38y2Pwxs06Kum5k1yvnXzKwczr9mZnVU/hIS4L8i4ueSDoqIccA4SeNK6ss5EXGipPmAO4FzI+Li2idExIPAJr1fGBFPA8PrbN+z1/0Fcu2xmVnjnH/NzMrh/GtmVkc7FDDezf5/QdKXSBMaLVFSX0ZKWhmYF7giIiaW1A8zs1Zw/jUzK4fzr5lZHe1QwDhF0sLAocCPgIWA75bRkYjYtYx2zcxK4vxrZlYO518zszraYQ6MHQFFxJSI2AzYEti25D6ZmQ0Ezr9mZuVw/jUzq0MRUXYfZkrSAxGx1qy2DTRdXV3R3d1ddjfMLAeSJkREV9n96M35tz7nX7PO4fzbXpx/zTpHo/m3HUZgDJK0SM8dSYvSHpe+mJm1O+dfM7NyOP+amdXRDonwXOBeSTcAAXwNOLXcLlXDjx/bvJC4B6yYlhlf5urTCokP8NSuxwDw2UvOLiT+M/9zOACrH3J+IfEfOu9gANbb47xC4gPcf8UhAAxf5dhC4t/8cPozGjF4v0LiA4yedhEAwxfdp5D4N/8rTYK+5aAdC4k/dvr1hcavbaOinH/7sNW8u5XdhYbc8vZVAKyzbzG5sRUmjEz5d/kbvl9yTxr3xA7HATBiucNL7knjRv81fX4XmR+L5vzbnpa9tpjj07/tnI5N19mnmPw44eKUu4avVsxxHcDNk9NbZKMdzikk/j03HAbAkCvPKCQ+wNTdjwJg6K1HFBL/7i3OAmDlY4r5PT9yWvo9j1jp6ELiA4x+9HQAPrfTuYXEv/e6Q4HW/J4bUfkCRkRcKakb2BwQsF1EPFJyt8zMOp7zr5lZOZx/zczqq3wBAyBL2E7aZmYt5vxrZlYO518zs49qhzkwzMzMzMzMzGyAcwHDzMzMzMzMzCrPBQwzMzMzMzMzqzwXMMzMzMzMzMys8lzAMDMzMzMzM7PKcwGjRSS1xYovZmadxvnXzKwczr9mljcnlZxIOg7YDXgWeAmYAHwZuBfYCBgl6Qnge8DcwD+z508DHgc+FxHTJA0CngA2iIiXWr4jZmZtxvnXzKwczr9m1moegZEDSV3A9sBawHZAV83DH4+ITSPiXOBuUmJeC7gWOCIipgO/JCVzgC2ASfWSt6R9JXVL6p42bVqBe2Rm1h6cf83MyuH8a2ZlcAEjH0OBmyLirYh4HfhdzWPX1dxeArhF0mTgcGCVbPulwO7Z7b2By+o1EhEjI6IrIroGDx6c6w6YmbUp518zs3I4/5pZy7mAkQ/N5LE3a27/CPhxRKwGfAuYFyAingVelLQ5sD4wuqiOmpl1GOdfM7NyOP+aWcu5gJGPu4GvSJpX0gLAl/p43sLA89ntPXo9dglpKN2vIuL9YrppZtZxnH/NzMrh/GtmLecCRg4iYjwwCpgE/BroBl6t89QTgesl3UWa6KjWKGAB+hg+Z2ZmH+X8a2ZWDudfMyuDVyHJzzkRcaKk+YA7gXMj4uLaJ0TETcBNfbx+DdLkRY8V3E8zs07j/GtmVg7nXzNrKRcw8jNS0sqk6/quiIiJ/X2hpKOAb/PhTMxmZtZ/zr9mZuVw/jWzlnIBIycRsWsTrz0DOCPH7piZDRjOv2Zm5XD+NbNW8xwYZmZmZmZmZlZ5LmCYmZmZmZmZWeUpIsrugzWgq6sruru7y+6GmeVA0oSI6Cq7H9Y/zr9mncP5t704/5p1jkbzr0dgmJmZmZmZmVnleRLPNjb52SUKibvaks8BMG7q8oXEB9h0yBMAbHvP/xYS/zcb/QSAZc89r5D4fzv0EADW3auY+ADjL0ttbLHpaYXEv3XcMQCM+Mx3CokPMPr5HwGw1bzFTDB+y9tXATBsrp0LiT/m3WsB2HLQjoXEBxg7/frCYl
txinxPFKnn/TZi8WJybyuMfiHl9yFXnllyTxo3dfcjAVjzD8eV3JPGPfil7wMwYvB+JfekcaOnXVR2F6wBI+48qJC4oze5AIC19zu/kPgTLzoYgC9sfnoh8QFuu/1oAFb4fjH78PhxaR+OmFTcZ+BZa6TPqVVuOrGQ+A9vk+KuekQxP6MpZ6Wf0bANTi4kPsCY+44HYNnzCjrPOSSdg1z91/ULiQ+w63J/afi1HoFhZmZmZmZmZpXnAoaZmZmZmZmZVZ4LGGZmZmZmZmZWeS5gmJmZmZmZmVnluYBhZmZmZmZmZpXnAkZOJA2RNKXO9jskeX1xM7OCOP+amZXD+dfMWs0FDDMzMzMzMzOrPBcw8jWnpCskPSTpBknz1T4o6Y2a2ztIujy7PVjSjZLGZ/82anG/zczanfOvmVk5nH/NrGVcwMjXCsDIiFgdeA3Yv5+vuwA4PyLWBbYHLimof2Zmncr518ysHM6/ZtYyc5bdgQ7zbETck93+JXBgP1+3BbCypJ77C0laMCJer32SpH2BfQGWWmqpHLprZtYxWpp/F2HpHLpsZtYRWpp/B+fQYTNrXy5g5Ctm4/68NbcHARtGxFszDR4xEhgJ0NXVFfCPRvtpZtZpWpt/n220m2ZmHafFx79mNpD5EpJ8LSVpw+z2LsDdvR5/UdJKkgYB29ZsHwMc0HNH0prFdtPMrOM4/5qZlcP518xaxgWMfD0K7CHpIWBR4Ke9Hj8K+D1wO/BCzfYDga5s8qNHgP1a0Vkzsw7i/GtmVg7nXzNrGV9CkpOImAqsXOehz9c85wbghjqvfQnYqai+mZl1MudfM7NyOP+aWat5BIaZmZmZmZmZVZ4LGGZmZmZmZmZWeS5gmJmZmZmZmVnluYBhZmZmZmZmZpWnCC+n3I66urqiu7u77G6YWQ4kTYiIrrL7Yf3j/GvWOZx/24vzr1nnaDT/egSGmZmZmZmZmVWel1FtY/95YZlC4s6z+FMATP/H8oXEBxj0qScAuOmpNQuJv80yDwIw5GfnFBJ/6rcOA2CjHc8tJD7APdcfCsCw9U4uJP6Y+48HYMTyRxYSH2D0E2emNpY+pJj4T58HwLC5di4k/ph3ry00fm0b1l62HLRj2V1oyNjp15c9PGkAACAASURBVAPt23/4cB82G3ZmyT1p3J/GpLxb5Ods0Xo+xzfctbjPwaL9+epDy+6CNeDSJ4YWEnfv5e8GYMiVZxQSf+ruRwGwxaanFRIf4NZxxwCw/u7nFRL/L1em47lWnCPs9pd9Col/1foXA7DqEecXEn/KWQcDrfk9r3xMMfvwyGlpH6Y+t3gh8QGGLPFCw6/1CAwzMzMzMzMzqzwXMMzMzMzMzMys8lzAMDMzMzMzM7PKcwHDzMzMzMzMzCrPBQwzMzMzMzMzqzwXMMzMzMzMzMys8lzAaICkIZKm1Nl+h6SuBuLtKenH+fTOzKxzOf+amZXD+dfMqsAFDDMzMzMzMzOrPBcwGjenpCskPSTpBknz1T4o6aeSuiU9LOmkmu3rSrpX0iRJ90tasNfrviTpz5IWa9WOmJm1GedfM7NyOP+aWalcwGjcCsDIiFgdeA3Yv9fjx0ZEF7A6sKmk1SXNDVwHHBQRawBbAG/1vEDStsBRwBcj4qXeDUraN/tQ6J42bVoxe2VmVn3Ov2Zm5XD+NbNSuYDRuGcj4p7s9i+Bob0e/5qkicADwCrAyqSk/0JEjAeIiNci4r3s+ZsBRwJfioiX6zUYESMjoisiugYPHpzz7piZtQ3nXzOzcjj/mlmpXMBoXPR1X9LSwGHAF7IK9R+AeQHVeV2Pp4AFgeXz76qZWUdx/jUzK4fzr5mVygWMxi0lacPs9i7A3TWPLQS8Cbwq6ZPAiGz7Y8CnJa0LIGlBSXNmjz0DbAdcKWmVwntvZta+nH/NzMrh/GtmpXIBo3GPAntIeghYFPhpzwMRMYk0dO5h4FLgnmz7O8BOwI8kTQLGkirTPa97HNgNuF7Ssi3aDz
OzduP8a2ZWDudfMyvVnLN+ivUWEVNJ1/T19vma5+zZx2vHAxv02nx59o+IeKCP2GZmA57zr5lZOZx/zawKPALDzMzMzMzMzCrPBQwzMzMzMzMzqzwXMMzMzMzMzMys8lzAMDMzMzMzM7PKU0RfyzJblXV1dUV3d3fZ3TCzHEiaEBFdZffD+sf516xzOP+2F+dfs87RaP51AaNNSZpGWju7vxYDXiqoO62I34o2vA/lx29FG1Xch89GxOCiOmP5aiD/zq5WvEeL5n0oX7v3H1qzD86/bcTHv45fUhveh2LiN5R/XcAYICR1F/kNQ9HxW9GG96H8+K1ooxP2wTpbJ7x/vA/la/f+Q2fsg5XLxxSdH78VbXgfyo9fy3NgmJmZmZmZmVnluYBhZmZmZmZmZpXnAsbAMbLN47eiDe9D+fFb0UYn7IN1tk54/3gfytfu/YfO2Acrl48pOj9+K9rwPpQf/wOeA8PMzMzMzMzMKs8jMMzMzMzMzMys8lzAMDMzMzMzM7PKcwHDzMzMzMzMzCrPBQwzM7MKkPQxSSuU3Q9rX5LmlzQou728pK0lzVV2v8yscZI+UXYfzKrEk3hawyRtBmwPLAm8BzwJXBIRf3X8D9rYCvgq8BkggP8DboqImx1/lm1PjojVim7HrAokfQU4B5g7IpaWtCZwckRsXXLX+k3S54AhwJw92yLiytI61ABJOwI3R8Trkr4HrA2cEhETS+5av0iaAGwMLALcB3QD/46I3Urt2GyStBHwYES8KenrpN/DBRHxTMldswqT9Gvg18BvI+KNAuIvDBxNOi4anG3+f8BNwBkR8UoObSzaexMwAViLdN72rybjrx4RDzUTw4ol6fiIOLnJGItFxEs1978OrAdMAS6OHAoA2Xv1ANK5wc+BY4ANgUeB0yLi5Wbb6LNtFzA6TyveUJLOAD4J3EZK5E8DTwD7Z21cP5DjZ238AFgeuBJ4Ltu8BLA78GREHDSQ42dtbNfXQ8BFETG4j8dzIWnLiBjbrvGtc2QnnpsDd0TEWtm2hyJi9XJ71j+SfgEsCzwIvJ9tjog4sLxezb6en7mkocDppKLSMRGxfsld6xdJEyNibUnfAT4WEWdJeqDnPdUuJD0ErAGsDvyCdCyzXURsWmrHrNIkPQ/8mZRLbwWuAf4QEe/kFP8W4Hbgioj4R7btU8AewBYRsWUObUwHehfqliAdh0VELNNk/PdJx7zXANdExCPNxGug/aZPzvuIe3tEbJ5jvPJOzqW/R8RSTcaYGBFrZ7e/RypsXw18GXguIg7OoZ9/BCYDCwErZbd/BWwJrBER2zTbRp9tu4DReVrxhqr9dlzSnMC4iNhI0iLAXRGx6kCOn8V9IiKWr7NdwBMR8d8DOX4W613gKtLojt52iIgFm21jFu03/SFRZnzrHJL+EhHr155stlkB41Fg5Ty+1SlTz89f0unA5Ii4up0KAJIeIBXizwe+GREPt+NotppCzPHA8xHx89oDcrN6av5+FyR9ObULsC7we9LJ+pgm4z8eEXUv85vZY7PZxmHAFsDhETE52/Z0RCzdbOws1gPAN0g/m52AN0nFjGsjYmoebcyi/TxOznuPIBHpC7fHAfL43Cz6XErSa309RCo+z9nH4/2NX3ssMRHYOBvRNhcwMY/PBEkPRsSa2XnBcxHxmd6PNdtGX5r64VhlfToivljzhvp8tv0uSQ/m1MZ0SYtmQ9k+DcwBEBEvZ+0O9PgAb0taLyLu77V9XeBtxwfgIeCciJjS+wFJW+TRgKRRfT0E/FfV49uAMUXSrsAckv4bOBC4t+Q+zY4pwKeAF8ruSJOel/Qz0gnEmZLmob3mC/suaYj7b7LixTLAn0ruUyNel3Q08HVgE0lzAJ7Lw2YlACLiddLInV9k36R/DTgKaKqAATwj6QjSCIwXASR9EtgTeLbJ2ABExDmSrgXOl/QscAL1v+RpoomYAhwLHCtpPWBn0jnCsxHxuWYbmNXJebPxganAa8ApwF
tZ3LuAr+QQu0fR51KvAOv2vI9qZb/3Zn1M0lqkz685IuJNgIh4NxuFk4dB2Re/CwILSBoSEVMl/Rcwd05t1OUCRmdqxRvqNOABSY8DKwLfBpA0GJjk+ADsBVyYfRPQcwnGkqSku2cO8fcEftrG8SEdbPf1QbdtTm1sTDoI7n09rEjXA1Y9vg0M3yEdUP6H9G3YLcD3S+3R7FkMeETS/aR9AKCd5vDIfA0YTiqsviJpceDwkvvUbxExDhgnaSFJC0bEU6RiWLvZCdiVNIrkH5KWAs4uuU9WfR+Z9yL7ouqi7F+zdiIVQsbpw4k1XwRGkXJHLiLiOWBHpbmRxgLz5RWbdGxS29b9wP2SDgU2yamNQk/OI2JrSdsCI0m5epSkdyPfOXKKPpe6Evgs6f3T29U5xH8BOC+7/S9Ji0fEC1n/38shPqTLLB/Lbu8NXJJ9B7wScFJObdTlS0g6kKRdgB9kd/cnnZwHsDJwUkSMzKmdRYFlgL9GDhMXdVr8mnY+RZoEs6eK+w/Hbx1Jo4GzIuIj30JKujMimvrALjq+WTuQVHduguyEuvL00YnzZhBNTpzXKpK6gMtIB90inUjsHRETSu2YmTVE0seAZeuNVG0w3q4RkccJ8szaOAUYVWcEL5LOjIgjc2pnflKhfzlg7YhYIo+4WeyWnEu1WjaabZ6I+HeO8RQR7yldkr8m6bK/QkdjuoDRoVrxhlK2VFtETJc0N7AqMLWoAz1J+0fEhQXFXoB0/dxTeRUz1IKZnrNvpV7LvikcAnQBj0bEwzm20UXNSi0R8dgsXjK78Utb6cSsbJJ+x0yGB7fLCAZJe5PmD3qy7L40QtLTpN9DvUsII5qcOK9VsmvD/zci7sruDwUubKO5VF5n5n8PC7WwO9YBJJ0WEccUGP/KiNg955jrkfLOeEkrk0aFPRYRf8yznU4haQ1gw4jIY5RNbdxCz6Wyy1PWY8bj3/vznEuqBcfw9c5DHsur4NYXX0LSoSLifWVrv0fEe6Sl1D6yrE6jJH0V+BlpLon9SLPzvgksL+nbEfG7JuMf0nsTcLSkeQEi4ryPvmq24l8YEftnt4eShmv9DVhO0rdy+pB4IDsoLmSmZ0lHAd8C/iPpHOAw4B7gJEk/z+FntClwLukbvHWy2IsoTbz5jYhoehig+l7p5EBJIyKHlU7MKu6c7P/tSHNI/DK7vwvpOt92MQT4uqTPkpb8u4tU0Mhr3qVC5TVBXgW83lO8AIiIu7OiQFuIbOJmSScD/yDNYyBgN9KoErM+Sfph703AN7IvqYgmV0XSR+e8ErCZpI9n8ZsuOEs6ARgBzClpLLA+cAdwlKS1IuLUJuMvRJonZwlgdO1ojNpj4zxImisi3u21LZfzkFoRMYns8m9JK+Z4kj4dWE9SbYEhl1HIkoYBFwJPAs9nm5cgnYfsH81PONuKY/hCz0Nm2rZHYHQeSZuRPvTnAR4A9o1sZmHlNIu30izGI0iT8UwiXev2eHbwemNEdDUZ/3Xgj8DDfPiN2HfJhnNFRFPXVmnG5YX+BBwaEROVJjz7VbP9z+IWOtOzpIdJlc75SCc6y0TEtGxI3V+i+ZVaHgCGZTGXBs6LiG0lbUmaHXtYk7vQkpVOZtF+07PzS1qSdG32Z4DRwNk9H9iSfhsRX22+p9bp6l1u1I6XIGXDnfchHch8JiLmKLlL/SJppp+LETGxVX1pRE3/v0H6TLiGdMC9E/ByRBxbVt8aoWxVnlltM6sl6TnSyf4YPjx27DmxIiKuaDL+ROAR4BI+HLF1DWkSzFwumZM0mfRN/zykk+UlIuK1LLf+pdnRVJJuJJ0030eat+BdYNeI+E+O5wiFn4fMpO1cVn+bWYEByKPA8Cgwovf5QHa8/ceIWKnJ+K04hi/0PGRmPAKjM50FbBVpBvIdgLGSvhER91F/eGxD4sM1sP8eET1LFz3Tc2lJk1YhTT4zP+las39L2qPZwkUfFuo5OI2Ip7IhY3mIKHam5/cj4i1J75
BmYf5n1uibymchlTkiYlp2+++kyYaIiLHZyIk8FL7SiaTt+nqI9I13sy4FbiQdDHyTNLnXVyLin2Q/M7N+GCxpmUiTLvYcxAwuuU/9prTO/EbAAqQD1sNIozDaxbkzeSyAzVvVkQb17v8JNbfb8Zuq9yXtBlxL6v8uQF4z51vnWok0J8Jw0kna85JOaLZwUaMLOIh0XHd4RDwo6a08Chc13ouI94F/S/pbRLwGkB3vTc8h/rIRsX12+7eSjgVul5Tn5YqFnofUGWnzwUPAx5uNn7kA2KKvAgPpvdaMOflw5HGt58lnxaVWHMMXfR7SJxcwOtPckc2BEBE3ZFW+X2dDffK8rmpQREwnVXB7ts1BDrPzRsTfgR0kbUNKfOc3G7OXFZWuFRYwRNIikZZQHUR+S7UVPdPzRElXk4o8twFXSLqZdKCdx+Uq3ZJ+nsXehvStBpLmI1t2Ngd7UvxKJ9cBV1H/vT9vDvEH11x3+R1JXwfuzA4G2vHEwcpxMHCHpKey+0NIQzPbxXaka2z/AIwD7ouIvJZbLlxEbFZ2H5oREZtln187RMSvyu5PDnYlnUBcQMqj92TbzPoUafnU70paB/ilpD+Q4zLI2THv+ZKuz/5/kfzPpd6RNF+kSRbX6dkoaWHSJQ3Nmqfm+J2IODUbuXInqQCdh6LPQ/YCDqVmxasau+QQH4ovMFwKjFdaMrfnco4lSV90/jyH+K04hi/6PKRPvoSkA0nqBr4cNatFSFoC+D2p8tr0daSS1gUm9z5AVZrAZWhE/LLe6xpsaz7Scjzr5zWcOrvUpdb/RVobeTFgk4j4dQ5tFDrTs9KEQjuSPgxuIE0EtCup0vqTyNZ8biL+XKSh4CuTLhO6NNLcKh8DPhE5LlelAlc6kTQB2CPqTCiUjYRZssn4DwPr1P4tSNqCtGTb/BGxeDPxbeCQNA9pWWdIk2DVOzirrKwQOTT79zXgxYgYWm6v+kfS5hFxe18jtvL4TGiFdrzsyKwI2aWo+5Mmd/x6QW18CdgocpwkVNI89XJ/dny6eERMbjL+WcCYiLi11/bhwI/yuHS36PMQSbcD34uIe+s89nTkMKeRpKNJn2P1Cgy/iojTc2hjZWBrao5/Sau3NH3y34pj+DrnIeuTCki5nIfMtG0XMDpPdvI0LdKkNrXbFwYOiCYnAJrNvtxYM1TN8Utoo13jK6eJmCRtDDyTjerp/VhXRHQ3Gf9gYGLvIaSS1iItr7plM/Gts3XQifOqwMbApqRh1s+SJvE8vtSO9ZOkkyLiBEmXZZt6Do5Euhxw7z5eWimSjiMN5b2ONO8S0D7LwPZQmrD7m6TLST8YKdcuvwcrn9LSyBERLxcQ+5PUrBwRES+2YxtFmcl5yMdJqyQ1OxHposDbkdNSoDNpp7ACgzXHBYwBrEUn5w9ExFqOX14b7RpfOU3EZFZldU6ca7XTifMfSEOQ7wLGR6+Z59tFduK8PekSnp6h4RERJ5fWqdmgtPJVbxFtsgxsj2yI/mOkUYUnk1YheTS8MpXNhNKSjmcBXyCtviBgIeB24Kje8xk0EH9N0ujKhZlxYsdXSBM7Nj3Zb4vaWJF0WUHt6hqjIuLRZmNb/2RfKh8NfJUP57v6f8BNwBkR8UqT8RcAjiBd3rkk8A5ptcWLIuLyZmLXaWN70nu0p42f5jjvTF2eA2Nga8UBTdEVsnaP34o2Khu/RRMxzaz944s8MSk6vrW/iDgh+3+vsvvSjIj4kqS5ScsiryDp8TYtYvyWdKIwkQ8nEm6bb3ryGDpdEctFxI6StomIK7LrrG8pu1NWedeRVqvbLdJEmD1zs+1IuhRggybjXw58KyL+UrtR0gbAZcAaTcYvvA1JR5KG+V8L9EygvgRwjaRrI+KMZuJnbRR9cl5o/Ba18StSYe3z8eGiCJ8izf92PdDs6N2rgN+QJrT9GmmeimuB70laPqfLnnra2KpOGyvkeWlVbx6BMYCp4KWMWtFGu8dvRRtVjq+0XG5fEz
GdGxGLNdW5Wbdf6CgPjyKx/pJ0EOng9HXgYmBt0jeGTS3V1ipKa85fSVpKTaRvfPaIiDvL7NfskjQlClz6rWiSdq+3PSKubHVfmiHp/ohYT9KdpHkM/gHc324jSay1JD3Z1xwOM3ssp/h/jYjlmonfijYkPQGs0rvAnBWgH85pDoxbSCfnV/Q6Od+DtLJHUyfnRcefRRt7Al/IYR8ej4gVZvex2Yg/KSLWqLk/PiLWVZrs+ZGIWHEmL69MG33xCAwrWrHr6LR//Fa0UeX444EpfUzEdGITcWvjvNbXQ8DHqh7fBoy9I+ICSVsBnyDNsn4Z0BYFDNKy18MiW1Jb0vLANdTMot8m7pW0WjQ5UV6J1q25PS9pKP1EUnGpnYyUtAhwHDCKtDrCceV2ydrABEkXAlcw48SLe5CWd27W6OxyuSt7xd8duDmH+K1oYzrwaaD3JI6Lk88qJwBDIuLM2g1ZEeBMSXlcFll0/Jm1cYakPEZMPiPpCFKB5EX4YN6TPfnw996MNyUNjYi7JX0F+BeklXSk3NY4bUUbdbmAMbC14uT8SMcvvY0qx9+BD4dpzyDHodCvAOvWmwBLUh4fEkXHt4GhJx9/EbgsIiYVfQCQs7l6ihcAEfGE0izobUHSZNKlInMCeyktZ/sfPpzEc/Uy+9dfEfGd2vvZMOhflNSdhkXEJdnNcbTmclfrDLuTJn89iV4TL5LD0pQRcaCkEXw4f0RP/J9ExB+bjd+iNr4L3CbpST48UV4KWA44IIf4UPzJedHxW9HGTsBRwLgsLqSRZqNIl2M0az/gEkkrAJNJfxdIGgz8JIf4rWqjLl9CMoBJGtbo8GRJXcDZpAmGjiatZ7we8ASwb0Q0VemeycQwuUw+U3T8VrTRCdcYzkZfGp5wVtIppMmp7q/z2JkR0VSBp+j4NjAoTeL5GWBp0jXOcwB3RERbjGCQdCmpANBzsrwbMGe7zO2hjy6tPYPIcdnoVsqKSA9FxEpl92V2ZJ8/J5JWtgG4A/h+RLxaVp/MOkU2xH89ZiyQjO+ZNySH+IuQTs63AXqfnJ8ZTa6KVHT8VrVhjXMBowMVXVzI2rgfOIE00eJZwMERcYOkLwCnRMSGTca/iTQxzK30mhgGeL7ZiWGKjt+KNjrhGsPZ6EsrVoNZJSIebtf41t6yA8o1gaci4hVJ/wV8JiIeyh6v9PtH0jzA/wJDSQfEd5K+MXyn1I4NMJJ+x4eTjs4BrAT8KiKOKq9Xs0/SjcAU0qUAAN8A1oiIussNm/XILsP7KjOusHFTRDR9+YWkOUnfMn8kPvDz3vNKVLWNrJ22Xaa1U6jg1WAkLQtsS7oE6T3gSeCaPAvBrWijbrsuYHSeoosLWRsfnFCq10SFeZxsFj0xTCdMbtOCCYAKjT+bffFkqjagVf39I+mgiLhgVtusWEqTqfZ4D3gmIp4rqz+NkvRgRKw5q21mtST9gLQS0pWkUQWQRsDuDjwZTS7DK+ka0mWjV/SKvwewaETs1Ez8VrShGZdpfY5UcM51mdasnaJPzgtfCrbINjTjajC1v+edgaZXg5F0IPAV0mV4XwQeBF4mFRv2j4g7monfqjb64jkwOtNcETEaPhjCfgNARNwm6Zyc2nhb0jBSAgxJX42I32YHT3kMQSt6YphOmNymE64xrJIqT3ZqVvX3zx5A72LFnnW2WYEiYlw2Um490gH330ruUqPe6vn8BJC0EfBWyX2y6vtiRCzfe6Ok60ijkJsqYABr1/ny5jngPqXVPfJQdBuXU/BSsCp4qdai47eojW9SfzWY84CHgWbj7wOsGRHvZzH/GBGfl/Qz0miePEY1t6KNulzA6ExFFxcgTdxyFmnG4q2Ab0u6nHTZyj45xP82cLHSTPZTgL0h14lheuIXOfFM0fvQewKgAF4kvwmAio4/O1px8lb0cDQPd7NmVPL9I2kXYFdgaUmjah5aEPhnOb0auCT9D3A86fI/AT+SdHJEXFpuz2bbfsCVSnNhQPpWb4
8S+2Pt4W1J69WZk2pd+pgwfDa9LGlH4MaImA4fXP63I+k9moei25i/d/ECICLukzR/DvGh+JPzouO3oo1WrAYzJ+m8bx7SZzIR8XflO8F2K9qo26h1nqKLC0SaIf+7pD++57JheQcBSBqeU/zvANMjYryklSUdAjwWET/MIz7pG6oPSLoyInYHmo5frw1JQyV9mbRsaB5tfAP4cYGTRL4JPAKMjYhbJe0GfA74O/B6QW32xRNhmlXTvcALwGLAuTXbXwceKqVHA9vhwFoR8U8ApblU7iXNhdUWJM0BfD0i1pC0EEBE9LVctVmtPYGfSlqQD4flLwm8lj3WrJ2BM4ELJb1MKhJ+nFQw3DmH+H21sTDwp5zaaMVSsEWfnLfi5L/oNopeDeYSYLyk+4BNSO+pni9R85qAtBVt1OU5MAYYSXtFxGU5xDkQ2B94jDTx3EERcVP2WNPXaks6ARhBKrKNBdYnzUK+BXBLRJzaZPxRdTZvTvoQIiK2biZ+1sb9EbFedvt/SAnpN8Aw4Hc5DKF7lVRk+BtwNXB9RLzUXK9niH8V6ef/MeBV0iSkvwG+QModTX8bpmqtdHJfRGzQrvGts/n9Y/0h6TZgRM/kqZLmJg3r3aLcns0eSbdHxOZl98PaU3YZ1QcrbPRMRJ5zG/9FOhbK7birVW2o/jKtoyKnpWCzLzJ/TJrQ8SMn581OqFp0/Ba2UfRqMKuQJnKeEhGP5RGzjDbqtusCxsCiXhNuNhFnMrBhRLwhaQhwA/CLiLhA+UziOZlUGJmHtGzREhHxmqSPAX+JiNWbjP8AaQjYJaSh2QKuIatuR8S4ZuL3tFEz0el40rWZ07IhevdFxGrNxgfWIRV1dgK2BiaQ9uPXEdHUKAlJD0XE6kozYj8PfDq7zk3ApGZ/B1kbpa50ImnFIhNu0fGtcyitunApMLpn2HA7kbQd6duXT5DyqYCIiIVK7dgAkY1QhPS5uRqpCBykk5T7I2K/svrWCEnnAv8NXE8q1AMQEb8urVPWFrKTwp75xuYGVgWmRj5Lay4F/L+IeDs7FtoTWJs0WvXiiHiv2TaydhYAhjPjyg5j2umzoQUn54XGb1UbvdrbOiLqfcGaR+zlSPObPBoRj+QcezBpfpD3gKcj4o0849dt0wWMziOpr2G7ApaPiHlyaOORiFi55v4CpCLGI8Dm0eRM4b1O/mcoiCiHmcizpHQQadbcwyPiQUlPRcQyzcTt1cYk4PPAINKoka6ax/Io8sww0iW73mwEadKhLSJicJ8v7l/8KaQP5vlJl418NiL+JWle4IGIWKmZ+Fkbpa50kldBr6z41jn+P3t3HidXVad//POw7wgSFkGMGyiiwxI2BVTW4E8FEXRcQHBhXFFHR3HHBUVFAUFAVARGRQQUGMckRGRVo2xBNhGVsA1CULaAbMnz++PcSoqmO0vXrbpV3c/79epXqu6t+p5T3Z3TdU+d8/1K2gU4ENiWctF28iBNfkn6C/Aa15gBPhZftWpxRLY/36u+1EHScCtFbfvtPe9MDAxJewHfoSzxfzfwScoE2EbAe2z/T4fxrwW2tv2wpK8CzwXOpqzgpY7fT0lvoGwFuxp4JWUL2FKUicm32L6mw/hLA++kXHBOsf3btnOftv2lTuIvpN0165hEaou3TGvCqLoGeQGlDHlXty602uv0Ir2a9B/qOMrq9o4nayVdAOxr+x5J+wGfoZQ33wY40fYxncSv2tiEsu1+ImV1ylWUDzEuoqzM71op1eTAGJvWoeS+GJrsR5SBsA5/l7SZ7ZkA1UqMV1M+QexoZUHlMUkr2X6YssoAmL/loOMZ6GoW+0hJZ1T/3kX9/x9Wp6yIECWZ6rq2/14NtHUkpXxSjCrR0LnAudVKlU59n7JFaGngU8AZkv5GucD6SQ3xoQeVTiSNlG+ktXe1r+PH+GD7V8CvqjHuTcB0SbcB3wV+ODSRWB+6K5MXzRm0CYpFsX3gws5L+oTtr/SqPzEwPkf5lHlFyg
TAVrZvlPQs4CygowkMYKnqfSmU1a9bVe8nf1h9aFWHTwPbVpMkawE/sr27pJdQJmde2mH8wohCEQAAIABJREFU7wArUSprHCPpItutFVx7Ax1PYLRPhFQXuWcDy1arVt7oYZKILmH8A4BvSPoH5cPIbwM3AxtJ+pjt0zp6AYt2PeWCvRM/peQcuZsF7+dXppQlNdDparMJbVuPDqasmv+HpJWAGUDHExiUa763Vf/HtgbeZ3sbSe+iXEPsU0Mbw8oExtj0C2CV1uRCO0kX1tTG/pSlQvNVM6H7q5TP6dSOth+t4rZPWCxLjZnIbd8O7Cvp/1GSPNXG9sQRTs2j1Eju1Ii1wG13XG7O9pEqpcew/X+STqX8wf6un5rhe7RalU4urCYuoP5KJwcCHwEeHebcmwYgfowT1X7nt1IS9F4F/AjYnjLmvaK5ni2Wy6vx4mza/i9kyX9vVZ+6PWVp7RjMJ7EvkAmMeIq27ai32r6xOnZLa2tJh26TtJPtXwOzKFs8bqnG7rqIBSWDH6J8oo3tP6pKatuhrVtbgCUdS0kW+jPK+5W6Kr61T4R8nfJp/JTqIvcoOp+E+QiwMaXqxdWUxMV/rd5HTqdspe5I27a8p5wCVuk0PrAdpZLJZcAJti3pFYuavF0Cj0ta3/YdwBwWbMV7lPLBZB1WbPs/9gdJJ1S3vyvpwzW1MaxMYIxBtt+xkHNvrqmN2xdy7jc1xB/uYpBqNrH2hEm2/xf437rjjtDWw5SZ4k7j1FVzfGFt/F/b7fso24TqjH+vpO9SfqatvZ5/Bk6rcenZZZTkQk9ZfSTp0AGIH+NA9QbyBcB/U7Zi3FmdOl3S5c31bLGtBjxMSVLcUsenSLFkPtp2ewXg9Qz5sGGM6EVp7RhAkpaqPvh6e9uxpYHlagj/Tkp530Mpyc1nVvnI1gBGuuBdUr8Epkq6iLIt+AwoWzCo5/d+/veh+uDxIEmt0st1XJgP9QzbU6r2/lDTCuG5resBSXNs/7WKf1dZ5FGLL1MmX4YbPzueDHOpsLgr8AHg15I+Tr3l0j8MnFfl17quamMqsAPQcTGHyl8lfQY4nzJpNRPmb2nv6hxDcmBERGNUqtm8mrIv71WUwe9eygqV99q+sIY21gQeaVv2Watux4+xr/pk8NO2v9B0X0ZL0gq2H2m6H/FU1RLxlzfdjzoNzQEVASBpK+CaoWORSrL57W3/sKZ2XkjJq7EMCxI71pZgU9KrgE0oCdOnV8eWApYd6QO+JYj9Q8q2xKlDjr8TON72sp3Er2LdR3lfJ8q242e13iNJutb2ph3GP5dyUb4q5ft0FWWyfBfgpbZ37yR+1cZvgQ/YvmKYc7fZfmanbbTFWx84EpjkenPxrQ68mSf/rp5TV34tSU+j5JnZhLIS5nDbD1btvtD2jDraGbbtTGBERFNUVZtxqW6yEqXc3ytUMn2f4w4TnUYMCkm/s71d0/0YrSqJ513AJZQ3rr/pZgKvGF41odqyFDAJONpdTojca6ohEXaMX5LOsv36Lsbv+njez38zJA2dML2iypW3DrCP7W93GH814H2UFQvHUvL+HQjcAnypbQVjJ21sDPzT9uxhzq3TytsWzcgWkoho2jLAXErJ3FUBbN9aLUHrWDUT/AlgL6BVmeVuSpnBw6utMV0haYrtPboVP8aU8yS9nlICeeA+WbD9vGricQfKqqrjJN3nDitGxRK7ggWlwR+n7NMfcVvpADuj6Q7EQKvtU+4RrNDl+F1pQ9KurRUfnbB90QjH76Ik3Ow0/gM8OQfOWdVXbVq5HUY41/HkhZ5cDWZq+/Z71VANRj2oNtOLNkaSCYyIaNL3gMskzQB2BL4K82tK11UK66eUvZ2vaEvutS4lMeIZwK6dBJc00jJmAbl4i8X1n5QM5E9IeoSqepHtOpK2dZ2kDYCXUSYw/o2yvPfSRjs1Pn2c8mb4gWpv8haU3CQDQdIxLGQfuO2Dq3+/3LNOxVjU7UniXkxCd6ON79N5dY
2FknSi7YP6PX63Jxh4cjWYb3WhGkzXq830qI1hZQtJRDRK0ouAF1ISYdayL29I/BtHWj69sHNLEH8upeb1cJmjtrVdR8KqiL4maR4loe2XbZ/TdH/GK0l/tP0SSdtTktB9A/ik7W0a7tpikdSqMvYyyr7q06v7+1KWoXc1s32MD93OodKLHC2jbaPKHzHsKWAn2yt31rOnbGUb2sbVtjfo5/hVG99jwcX5fsD8i/M6fr6tsbq6vQxwHLAWpRrMjE63yHU7fq/aGElWYEREo2xfR/m0tltukfQx4JTWsr9qH+YBwG01xL8B+A/bNw09IamO+DEOSDrf9s6LOtbHNqeUfH2zpEOAmyhv+L7fbLfGnbnVv/+PUprvnEGqhmT7FABJBwCvtP14df8E4LwGuxZjS7er2PSiSs5o29iBUq57zjDxtu6oRwvMpuSjaO9ja2vb2gMQH7pfbrbb1WB6UW2m1xVt5ssERkSMdW8EDgEuktT6w3YXcC7lU71OHcrIJbU+UEP8GMMkrUD5lGctSWuw4I3RasAzGuvYErJ9taS/An9lwRvkHSlLkqN37pD0HUo2/q9KWp4aSv414BmUnEitrYSrMED/H6J/SFrb9t1DDn+8y83uV0eQaqWBbd9bYxszgIeHy1MhacS8D0vob8DOtm8dpo06Ptjpdnzo/sX55ZImt1eDsf0FSf8HHD8A8XvVxrCyhSQixi1JB9quqx52z+PH4JP0QeBDlIuzO1gwgfEA8F3bxzbVtyUh6XJKIt7fUnJfXGz7lmZ7Nf5U1ZwmU0pJ3iRpPeDFtgdq9YKkAymTwxdUh14OHNpaoRExnGG2FoiS2HZzyjVPR7m1JP2TUq7zNODX3Ui4XCVD/hqwM3Af5TWsRrlwPsT2rLrbrJuk9wGX2r56mHMfsH1MP8ev4nS93Oxi9qOWxKpNxe9WG5nAiIhxS9KttruWsKrb8WPsqOtNV1MkTRiu3FzEaFXJllu5O37fSsIcMZIqF8/QidMNgNspKxk6qj5SrVA4hrKNYCJwJnCa7RmdxB3Sxu+Ao4Azbc+tji1NWTH6Idvb1tXWovrRg1KwuThfdPxxm69loTEzgRERY5mkP450CtjI9vL9HD/GD0kvpbwpnr+90/apjXVoCVTlij9H2TYCJbHtF2zf31yvYlBJEvAW4DnVkuQNgXVt/6HhrkUfk/RRyvap/7J9TXXsZtvPrin+/Aux6nfy36uvpwE/sf3JGtq4yfbzl/Rc3SRd1c0kjFUbuThfdPyu/hx69HOuvY3kwIiIsW4dYHdg6B5SUZa793v8GAck/TfwXGAmCxIxGhiICQzgJOBa4A3V/f2AH1BKqUUsqeOAecBOwBeAB4GzgK2a7FT0N9tHSPoJcGSVC+Fz1FtudH7yxir/wteAr0namDKRUYcrJB0HnMKCROPPpJR+v6qmNhZHLz7hHs/JVBdXSv4OIxMYETHW/QJYxfbMoSckXTgA8WN8mARs0o091T3yXNuvb7v/eUlP+T8RsZi2sb2FpKsAbN8rablFPSnC9u3AvpJeA0ynJEmuywXDHbR9I/D5mtrYH3hHFW99ygXy7ZTE42MtKXIuzmNUMoEREWOa7Xcs5Nyb+z1+jBvXAusCdzbdkVH6l6TtbV8KIOllwL8a7lMMrserff+GkmOFsiIjYrHY/h9J9wEvl7RbHYlsbf9nDV1bVBuPUSo4dLWKw2LoxeqFWLRZAx6/K20MYmmtiIiIsWYt4HpJ0ySd2/pqulNL4N3AtyXNkjQLOBb4j2a7FAPsW8DPgbUlHUapbPPlZrsU/U7SH9puv4vye7Q08DlJh3SpzV93I24v25A0XA6HWkrBLsKsAY/fURuSdqy2HyFpe0kflfT/2h9ju/ZtmJJ27UZ8Sc+WtLekF7Qf78prGNzVqhEREWODpJcPd9z2Rb3uy5KQ1P6JpICVq9sPUbL+f7P3vYqxoHoTvDPl9+p82zc03KXoc+3JAiVdBrzK9mxJKwMzbL+4w/
hDk3YL2Ai4EcD2SzqJ34s2hpmsEHAO8BrKdeGVncSv2ngtcJ7tRzqNNYq2u1YVRNKX60jUWsU6CtiashtiGmWsm0IpGX2V7f+qo50R2q6lQp6ks23vVd3ek1I950LgpcBXbJ/caRsjyRaSiIiIhvX7RMVCrFr9uzElweI5lDfEbwUubqpTMdgkHQ2cbvvbTfclBspSktagrDBXq7Sz7YckPVFD/FnAA8CXKFvkBFxCufivS7fbuByYATzaduzpwDcpW7Z2qqGN04GHJE0BTgOmtUrC9sD3gTouzr819BCwn6RVAGwf3GETuwKbAisCdwDr235Y0uGUZK0dTWAsZAWnKD/vOjyr7fbHgZ1s3yxpLeB84OSa2nmKTGBE1EDSocAc20cMOf5u4OFBKYUYEc2Q9CALkoEtBywLPGR7teZ6tWi2Pw8g6TxgC9sPVvcPBc5osGsx2K4EPi1pI8pWktNtX95wn6L/rQ5cQblIs6R1bf+9uujsOKeD7ddKeh1wInCE7XMlPW77lk5j97CNNwAfAL5u+5cwv9TsK2uKD/AnykTIPsBHgB9I+jlwWh2T9T26ON+bsprgPBb87vw75ferDrZtSa3cPq2///OoJ8XDDpQPEuYMOS7Kyo86tG/jWMb2zQC272l7XV2RLSQRNRhuAkPSMrbrmPGvNVZE9D9JewFb17Vctdsk/Qn4N9uPVveXB662/YKFPzNiZJLWBF5PuXDY0PbzG+5SDCBJKwHrtC6waoi3MvBF4HmUidsN6ojbqzaqCZ0vAhtQJhgutP2cGuNfaXuLtvvrUiZO3gRsYPuZHca/l5Evzk+3vU4n8as2VqV8j9YG/sv2HZL+Vtf3SdJXKVstVqBMlLyAsjLm5cDfbL+7w/hTgK/ZfkrlHEkX296xk/hVnLmU7aIClqeM0X+vKkZdXseWqhHbzgRGxOhI+hSl3NVtwGzKrOyrgd8CL6OUvFqVMsD+L3CK7a2r504EzrX9EklbUpburQLcAxxg+86qBOf8WLa/MUwfXgN8mvKJ7T+At9i+q8rY/mPKTPRlwGRgy2pW9K3AwdVzfg+8t4dL+yJiMUmaYXvbpvuxOKrx8A2UT8sNvI7yRvIrjXYsBpqkrYE3AnsB19uuc6l+REck/Ruwne0TBrENSZsBRwKb2p5QY9z5uUiGOfesTleT9OLivC3elsARlPfx77c9scbY21FWYsyQ9FzK381bgTNtD2zVJUlPA15o+3ddayMTGBFLrhrQTga2oWzFuhI4gTKBcb3t91aPO5RqZYakmcDetv8m6eOUJeJfBS4C9qwSTb0R2N3226sJjPmxRujHGsB91TK0d1IGjI9IOha4w/ZXJE2mJAaaUH19rerH45KOoyS2yhaXiAZJas/SvRQwCXi57e0a6tISq5LD7VDdvdj2VU32JwZX9enk3sBfgZ8CP7N9X7O9igBJrSX461Mma/8P+INrvKDqRRtD2lrV9gM1xnyF7Qvrite06nv0XspE0lu7EH9NykTGvXXH7kX8qo01gCda20i7LTkwIkZnB+Dnth+Gp+zHO32E5/yU8gnl4ZRPlN5ISXy3KTC9jI8sDdy5GLFaNgBOl7QeZUVFa3nk9pSZXGxPrZbbQclyvCVwWdXeisDdi2gjIrqv/ZPlJyiJ3PZspiujU2Wv7ziDfQTlb9l2tu9puiMRLZJ2A44DbqIkXoTyPux5kt5r+7x+b6O6GN+XMjFyJiVXxZ7VNsAT6vjkfyxNXkC58ge+XX3VQtKGlA8UdwLuL4e0GvBr4BDbs2qKvzNwX93xqzaeQbmm2ZOyivyO6triJOAw24932sZIMoERMXojzYQ/NMLx04EzJP2MMh7eJOnFwHUL+ZR1pFgtxwDfrJI8vQI4tDo+UrIqUbayfGIRcSOih2wf2HQfIvqF7RMkrVFtIVmh7Xgq20STjgZ2GXrxJ+nZwC+BFw5AG9+m5HVYjnLhuTzwP8CrKB+qfbDD+Eh6JvB1ygqSKZSEoY9X5+
aX3uzX+IvR/jXusCQv5ZrgKMrW77lV3KUpk0s/ATrdPtrt+AA/BL5ge/9qFekOlG3tn6D8nh1UQxvDqiPLacR4dDHwOkkrVol+Frkv1/ZfgbnAZ1iwsuJGYEK1Dw5Jy0p60RL0Y3UWzNC/re34pZTVHq3Z/DWq4+cD+0hauzq3pqT2MkgR0QBJG0j6uaS7Jd0l6SxJtSeGixgE1ZbIi4FpwOerfw9tsk8RlA9+bx/m+B2UbcGD0MYOtvehJMfdg3KBeyolKWYdJVShfAJ/IaXayXrARZJa1UHqeM/Z7fhI2nuEr9cD69bQxFq2T2/PQWd7ru2fUE8llW7HB3h6a7WN7Z8BO9p+yPangdrykAwnKzAiRsH2lZJOB2YCt1BqdC+O0ymzxs+u4jwmaR/gW5JWp/yfPAq4bjHjHUpZ1XEHJXvxs6vjnwdOq3JqXETZlvJglcTz08B5kpYCHgfeV72GiGjODyiJd/et7r+1OrZrYz2KaM4Hga0oOZpeKekFlL9rEU06ibIF9yeUBO4Az6RUyfn+gLTxBECVB+0y249V95+oqkrUYUJb0tEPVMnjL5b0WkZevdxP8aG8X//RCPFWGObYkrqiykN3Ck/+Ob8NqCN/VLfjA8yuvve/pkyIzYL525S6ukgiSTwjxiCVEoZzqz9I2wHH296s6X5FxPAkzRz6f3S4YxHjQXVhtVWV/Hob24/m/0P0A0mbAK+lbF8QZbXEubavH4Q2qgoe+9qeM+T4ulUbW9fQxnWUynePtB3bhZLsfmXb6/Vz/CreFcDbbF87zLnb3Hkp2OWAd1C28Tzp5wx831VJ8n6NX7WxIaVCyyaUD3T/q6qi+HTgFbbP6rSNEdvOBEbE2CPp+ZSkoUsBj1FKpV7WbK8iYiSSfkWpbHRadehNwIG2d26sUxENkfRz4EDgQ5Rl7fcCy9p+VaMdixijJK1MufjvOLG7pA8DV9q+aMjxzSnlTztaWdjt+FWsHYBbbN86zLlJti/vtI0YvUxgRAwASZ9iwdLyljNsH9ZEfyKiXtUnGccC21GWrP4WOHi4N08R44mkl1PyPU1tLXePaEK11fcTwF6UsvRQKrmdAxxeR6nfHrXRszKtMTJJu1N+zu0/h3NsTx2E+FUbr6RsH3kmZXvSTcD3bP+lrjaGbTe/qxEREc2SdArwoVad9qpu+xG2395szyKaIWkLSklwA7+pyvRGNEbSNMp+/1Ns/706ti5wALBzTZ/8d7WNhZVppazW7bgUbNXOWLg471obko4CNgJOZUHS1g2A/YGbbHdUDabb8as2DgfWoRQI2ItS/vrPwHuBL9s+o9M2Rmw7ExgRERHNknSV7c0XdSxiPJD0Wcqqw59Vh/airDr8UnO9ivFO0o22N17Sc/3UhqQbgD1GKtNqu+NSsGPk4rzbr+HPtjca5riAP9t+fj/Hr2LNLycraRngItsvk7QGcIntTTttYySpQhIREdG8pSStMWQFRv5Gx3j1JmDzVpK+6pO+K4FMYESTbpH0McrqiLsAJK1DWR1x28Ke2Edt9KIU7KtGuHg+nfIJfacTDN2O34s2HpG0te0/DDm+FfDIcE/os/gA8yStafufwDOApQFs31tNlHRN3hxFREQ07xvAbyWdSVmq+gYgOW5ivJpFKVXYeqO9PPDXxnoTUbwROAS4qJpUMHAXpbLDGwakjV6Ugh0LF+fdbuMA4HhJq7JgQumZwAPVuX6PD/Bl4CpJNwIvAN4DIGkCcHVNbQwrW0giIiL6QFU6bydKubPz6yzLFzFIJJ1NuVCYTrmA2xW4lJLMENsHN9e7GM8kbQ3Y9mWSXgRMBm6w/cua4m8D/Mn2/ZJWokxmbAFcR8krcH8NbXS1FGyVv+Z4YLiL5/favqKf4/eqjaqddWn7ObTyntSlB/HXBJ4D/KWOBLOL3W4mMCIiIiKiX0h628LO2z6lV32JaJH0OWAPygr26ZRKHhcBuwDT6qgMJ+k64N9sPyHpROAh4Cxg5+r43p
220SuDfnHe7Ta6XQ2mV9VmJE2irQqJ7T/VGX/YNjOBERERERERMTJJ1wCbUbY0/R3YwPYDklYEfm/7JTW0cUMrkaakK21v0XZupu3NOozf9TKtVTsDf3HezTa6XQ2mF9VmqhLX3wDuA7YEfgOsATwO7Ge7rrwwT5EcGBERERHRNyS9DDgUeBblvaooy/af02S/Ytx7wvZc4GFJf7X9AIDtf0maV1Mb10o60PYPgKslTbJ9uaSNKBeGnfoppUzrK4Yp03oGZbtWRxZ28SypqxfndcTvURtHA7uMVA0G6LQaTLfjAxwF7GZ7dhX3m1UVkl0p+VR2q6GNYWUFRkRERET0DUl/Aj4MXAHMbR23/Y/GOhXjnqTfA6+0/bCkpWzPq46vDlzQvlqigzZWp1x87gDcQ8l/cVv1dbDtjpIj9qgUbFdLtfaoFGy3X8NNwAttPzHk+HLA9baf18/xq1h/bK06krQ0cFnr/4Ck62y/qNM2RpIVGBERERHRT+63PaXpTkQMsaPtRwFakxeVZYGF5m1ZXFWSzgOq6hHPoSp72iqpWoNelILtdqnWXpSC7XYb3a4GM1z8DSlVbuqqNnO5pO8D5wN7AhcCVMlnl66pjWFlBUZERERE9A1Jh1PeAP8MeLR13PaVjXUqYgyQtAalsslrgXWqw60yrV+1/c8a2vgEpeTrcBfnP7X9lX6O38M2Xki58O9WNZhux18WeBewCaVs6km251Y5Yda2fUsd7QzbdiYwIiIiIqJfSLqgutl6k9rKgbFTQ12KGDMkPQ94HQsqR/wZOK2OEq1tbXS7VGtXL86rNrr6GnpN0tq27+5yG0/vxVa/TGBERERERN+oylUOZdtf6HlnIsYQSQcDrwYuBl4FzATupUxovNf2hc31LgAkTbG9R4cx1hzm8JXA5pTr/zpW2hwOHGH7nqqU6k+BeZQtNvvbvqjTNkZsOxMYEREREdEvJH2k7e4KlAuuG2y/vaEuRYwJrVKw1VL/lSgJKV8haUPgHNub19DGapRSrRtU8U9rO3ec7fd2GH+y7anV7dUppTy3Bq4FPlxHvpDqgvzrlJwXn6DklNiKUpXkINtXdRh/pISvAn5he70O488Dhm7h2ICyiqSWik6SrrH94ur2BcDHbF9WVcz5se1JnbYxkiTxjIiIiIi+Yfsb7fclHUHZox8RnVuGUt1neWBVANu3VjkN6vADyoX+WcDbJe0DvLlKgLptDfG/DEytbn8D+DvwGmBv4DvAXjW0cRzwOeBpwG8pEyO7Stq5Orddh/EvAy6iTFgM9bQOYwN8DNgF+C/b1wBIutn2s2uI3bKspGWqSicr2r4MwPafJS1fYztPkQmMiIiIiOhnK1EqMkREZ75HqU4xA9gR+CqApAlAx9sKKs+1/frq9tmSPgX8WtJra4rfbpLtzarbR0qqpRoMsGyrEpKkr9o+E8D2+dWEaqduAP7D9k1DT0jquBqM7SOqCiRHVvE+x4KcQnX5NvDLaivJVElHURIv70zZmtQ1mcCIiIiIiL5RLXNvvdleGpgAJP9FRIdsHy3pV8ALgW/a/lN1fDZlQqMOy0taqlVq1vZhkm6n5N1YpYb4a0v6T8rqhdUkyQtyIixVQ3yARyTtBqwOWNJets+W9HLK6pVOHcrIff1ADfGxfTuwr6TXANMpE8G1sX1MNVa/B9iIMq+wEXA28KU62xoqExgRERER0U9e3Xb7CeCuaplyRHTI9nXAdV1s4n+AnYBftbV5iqS7gGNqiP9dqq0vwCnAWsBsSetS3yf/7wa+RklKuTvwHkknU3JivKvT4LbPlPSCakvK723PaTv9SKfxASS9gFJB5QLKz+K51fH5OURq8HfgRIa8BkmTWbDNp3ZJ4hkRERERERFdJelA2z8Y1Ph1tVFVg3kfZSvJZsAHbZ9TnbvS9khJPvsifq/aGLHtTGBEREREREREN0m61faGgxq/rjaqrRfb2Z4jaSJwJvDf1RafqzqtBtPt+L1qYyTZQh
IREREREREdk/THkU4B6/R7/B61sXRry4XtWZJeAZwp6VkMX5mk3+L3qo1hZQIjIiIiIiIi6rAOJW/EvUOOi1KStN/j96KNv0vazPZMgGoVw6uBk4AXD0D8XrUxrExgRERERFQkLW27jizzERHj0S+AVVoXtu0kXTgA8XvRxv6UBMXzVYmK95f0nQGI36s2hpUcGBERETFuSDobeCawAnC07RMlzQG+SfnE7SPAROBgYDng98B7bc+VdDywFbAicKbtzzXwEiIiIsatumrlRkRERAyCt9veEpgEHCzp6cDKwLW2twH+AbwReJntzYC5wFuq537K9iTgJcDLJb2k992PiIgYv7KFJCIiIsaTgyW9rrr9TOD5lEmKs6pjOwNbApdJgrLa4u7q3BskHUR5/7QesAkwUrK3iIiIqFkmMCIiImJcqLKk70Ip/fZwtZd5BeCRtrwXAk6x/Ykhz3028FFgK9v3Sjq5em5ERCyCpNcCm9g+XNKhwBzbRzTcrRhA2UISERER48XqwL3V5MULgG2Hecz5wD6S1gaQtGZVFm414CHgfknrAHv0qtMREYPO9rm2D2+6HzH4MoERERER48VUYBlJfwS+CMwY+gDb1wOfBs6rHjcdWM/21cBVwHWUMnG/6VmvIyL6mKSJkv4k6XuSrpX0I0m7SPqNpJskbS3pAEnHDvPc50qaKukKSZdUk8tIeo2k30u6StKvqoljJE2QNF3SlZK+I+kWSWtV594q6Q+SZlbnlu7tdyJ6IVVIIiIiIiIiYlQkTQT+AmxOmeS9DLgaeAfwWuBA4Gxgku33t28hkXQ+8G7bN0naBviK7Z0krQHcZ9uS3gm80PZHqkmQO2x/RdJkYAowofr6GrC37cclHQfMsH1qz74R0RPJgRERERERERGduNn2NQCSrgPOryYfrqGUpn4KSasALwXOqJImAyxf/bsBcLqk9SglrW+ujm8PvA7A9lRJ91bHF5aAOcaQTGBEREREREREJx5tuz2v7f6IDAn9AAAgAElEQVQ8Rr7mXIqyymKzYc4dA3zT9rlVAuZDq+Ma5rGt409JwBxjT3JgRERERERERE/ZfgC4WdK+ACr+rTq9OnBHdfttbU+7FHhD9fjdgDWq4yMlYI4xJhMYERERERER0YS3AO+QdDUlf8ae1fFDKVtLLgHuaXv854HdJF1JqQZ1J/DgSAmYe/MSopeSxDMiIiIiIiL6nqTlgbm2n5C0HXD8CFtQYoxKDoyIiIiIiIgYBBsCP5W0FPAY8K6G+xM9lhUYEREREREREdH3kgMjIiIiIiIiIvpeJjAiIiIiIiIiou9lAiMiIiIiIiIi+l4mMCIiIiIiIiKi72UCIyIiIiIiIiL6XiYwIiIiIiIiIqLvZQIjIiIiIiIiIvpeJjAiIiIiIiIiou9lAiMiIiIiIiIi+l4mMCIiIiIiIiKi72UCI6JBkk6W9KWm+xERMd5k/I2IaEbG3+hEJjAiBoCkCyW9s+l+RESMNxl/IyKakfE3hpMJjIiIiIiIiIjoe5nAiOghSZtLulLSg5JOB1aojq8h6ReSZku6t7q9QXXuMGAH4FhJcyQdWx0/WtJtkh6QdIWkHRp7YRERfS7jb0REMzL+Rp0ygRHRI5KWA84G/htYEzgDeH11eingB8CzgA2BfwHHAtj+FHAJ8H7bq9h+f/Wcy4DNqlg/Bs6QtEJvXk1ExODI+BsR0YyMv1G3TGBE9M62wLLAUbYft30mZRDG9j9sn2X7YdsPAocBL19YMNs/rJ73hO1vAMsDG3f5NUREDKKMvxERzcj4G7XKBEZE7zwDuMO2247dAiBpJUnfkXSLpAeAi4GnSVp6pGCSPiLpBkn3S7oPWB1Yq5svICJiQGX8jYhoRsbfqFUmMCJ6505gfUlqO7Zh9e9HKLPH29heDdixOt56bPugT7Xf7+PAG4A1bD8NuL/t8RERsUDG34iIZmT8jVplAiOid34HPAEcLGkZSXsDW1fnVqXs+7tP0p
rA54Y89y7gOW33V61izQaWkfRZYLVudj4iYoBl/I2IaEbG36hVJjAiesT2Y8DewAHAvcAbgZ9Vp48CVgTuAWYAU4c8/WhgnypD87eAacAU4M+UZXiPALd1+SVERAykjL8REc3I+Bt105O3I0VERERERERE9J+swIiIiIiIiIiIvpcJjIiIiIiIiIjoe5nAiIiIiIiIiIi+lwmMiIiIiIiIiOh7yzTdgRidtdZayxMnTmy6GxFRgyuuuOIe2xOa7kcsnoy/EWNHxt/BkvE3YuwY7fibCYwBNXHiRJ7+pxc13Y1hTZtzCgC7Lv3GhnsysulzTwdg16X2bbgnI5s+7wwAJq/+9oZ7Mryp958EDMb3cI8NP9RwT0Y25dajkHRL0/2IxTdx4kTWuPLZTXdjVFr/J/p5fF6U+eP3WHgNfTx+Lsr836UBfw0ZfwdLN8ffbv9O9+L/TKuN3ZZ7c1fin/fYj7sav72NQf059PLnPOivYbTjb7aQRERERERERETfywRGRERERERERPS9TGBERERERERERN/LBEZERERERERE9L1MYERERM9IOknS3ZKubTu2r6TrJM2TNKnt+NMlXSBpjqRjFyP2YZJukzRnyPH/lHS9pD9KOl/Ss9rOTZV0n6Rf1PUaIyL6TcbeiBgrMoERERG9dDIwecixa4G9gYuHHH8E+Azw0cWM/T/A1sMcvwqYZPslwJnA19rOfR3YbzHjR0QMqpPJ2BsRY0AmMGo2dPa57fi7Je1f3T5A0jN627OIiObZvhj455BjN9i+cZjHPmT7Usqb6cWJPcP2ncMcv8D2w9XdGcAGbefOBx5cgpcQETFwMvZGxFiRCYwesX2C7VOruwcAmcCIiOi9dwBTlvRJkg6SdLmky2fPnt2FbkVEjGmjGnsh429EPFkmMJaQpI9JOri6faSkX1e3d5b0w+r2YZKuljRD0jrVsUMlfVTSPsAk4EeSZkpaUdKWki6SdIWkaZLWa+r1RUSMVZLeShl/v76kz7V9ou1JtidNmDCh/s5FRIxRnYy9kPE3Ip4sExhL7mJgh+r2JGAVScsC2wOXACsDM2z/W/XYd7U/2faZwOXAW2xvBjwBHAPsY3tL4CTgsOEazgx0RMToSNoF+BTwWtuPNt2fiIjxIGNvRNQtExhL7gpgS0mrAo8Cv6NMZOxAmcB4DPhF22MnLiLexsCmwHRJM4FP07ZHsF1moCMilpykzYHvUN5A3910fyIixoOMvRHRDZnAWEK2HwdmAQcCv6VMWrwSeC5wA/C4bVcPnwsss4iQAq6zvVn19WLbu3Wl8xERDZN0GmXid2NJt0t6h6TXSbod2A74X0nT2h4/C/gmcED1+E0WEvtrVZyVqsceWp36OrAKcEa1de/ctudcApwB7Fw9Z/d6X3FERPMy9kbEWLGoi+sY3sWU0lJvB66hDPBX2LakxXn+g8Cq1e0bgQmStrP9u2o7yka2r+tCvyMiGmX7TSOc+vkIj5+4BLE/BnxsmOO7LOQ5O4x0LiJirMjYGxFjRVZgjM4lwHrA72zfRSkzdckSPP9k4IRqy8jSwD7AVyVdDcwEXlpvdyMiIiIiIiIGW1ZgjEJVu3rZtvsbtd1epe32mcCZ1e1D246fBZzVFnImsGP3ehwRMXZI+j2w/JDD+9m+pld9mD7vjF411RXT557edBc6NiZew4D/HsHYeA2xePph7IXu/84NenyA8x778UDHh8H/OfTi5zwWXsNoZAIjIiIGiu1tmu5DRMR4k7E3IvpBJjAiIiKW0K5L7dt0F0al9WnKoPYf2l7D0m9suCej11o9MiZewxj4XYrB0q3fuW6Pj70Yf3v1GnZb7s1diQ8LVncM6s8hP+fF08kqnuTAiIiIiIiIiIi+lxUYA2zanFOa7sJCDcL+5EH49GXq/Sc13YWFGoTv4ZRbj2q6CxERERER0aGswIiIiIiIiIiIvpcVGANs8prvaroLw5r6z+8CsPvK+zfck5FNe+hUoL/3H7
dWsOwx8cMN92R4U2YdCfT3/ufW6pDtX39Ewz0Z2aVnfbTpLkREREREDISswIiIiJ6RdJKkuyVd23ZsTUnTJd1U/btGdfzpki6QNEfSsYsR+zBJt0maM+T4jpKulPSEpH2GnJsq6T5Jv6jrNUZE9JuMvRExVmQCIyIieulkYPKQY4cA59t+PnB+dR/gEeAzwOIuU/kfYOthjt8KHAAMl/L668B+ixk/ImJQnUzG3ogYAzKBERERPWP7YuCfQw7vCbSyEp8C7FU99iHbl1LeTC9O7Bm27xzm+CzbfwTmDXPufODBxX8FERGDJ2NvRIwVmcCIiIimrdN681v9u3bD/XkKSQdJulzS5bNnz266OxERdej7sRcy/kbEk2UCIyIiYhFsn2h7ku1JEyZMaLo7ERHjRsbfiGiXCYyIiGjaXZLWA6j+vbvh/kREjAcZeyNi4GQCo89IOlRS6ipGxHhyLvC26vbbgHMa7EtExHiRsTciBk4mMHpERb7fETGuSToN+B2wsaTbJb0DOBzYVdJNwK7V/dbjZwHfBA6oHr/JQmJ/TdLtwErVYw+tjm9VHd8X+I6k69qecwlwBrBz9Zzda37JERGNy9gbEWPFMk13YCyTNBGYAlwAbAecLenVwPLAz21/rnrcp4D9gduA2cAVTfQ3IqLbbL9phFM7j/D4iUsQ+2PAx4Y5fhmwwQjP2WFx40dEDKqMvRExVmQCo/s2Bg4Ezgb2odTJFnCupB2Bh4B/Bzan/DyuZIQJDEkHAQcBbLjhhqzV9a5HRERERERE9IdMYHTfLbZnSDoC2A24qjq+CvB8YFXKaoyHASSdO1Ig2ycCJwJMmjTJqZ4dEeORpN9TVrK128/2Nb3qw/R5Z/Sqqa4Y9P4DTJ97etNd6NiYeA1j4HcpFk8/jL3Q/d+5QY/fizbOe+zHXY0Pg/9zyM+5ezKB0X0PVf8K+Irt77SflPQhwD3vVUTEgLK9TdN9iIgYbzL2RkQ/yARG70wDvijpR7bnSFofeBy4GDhZ0uGUn8drgO8sJE5ERDRs16X2bboLo9L6tGb3lfdvuCejN+2hUwGYvNZBDfdk9KbecyIAe0x4d8M9Gb0ps08AYLfl3txwT0avXz9djIXbY/0PdCXulDuOAbo3PrbGrm7+/WiN8d36f9n6P7PHhh/qSnyAKbceBXT/NQxq/F600Yo/eY13diU+wNR7vzfq52YCo0dsnyfphcDvJAHMAd5q+0pJpwMzgVuASxrsZkRERERERERfygRGF9meBWzadv9o4OhhHncYcFjvehYRERERERExWJZqugMREREREREREYuSCYyIiIiIiIiI6HuZwIiIiIiIiIiIvpcJjIiIaJykD0q6VtJ1VXlpJK0pabqkm6p/11jI858u6QJJcyQdO+TclpKukfQXSd9Sl
gitextract_19gb0anp/
├── .gitattributes
├── .github/
│ ├── dependabot.yml
│ └── workflows/
│ ├── test_matrix.yml
│ ├── tests.yml
│ ├── valgrind.yml
│ └── wheels.yml
├── .gitignore
├── .mbuild.sh
├── .readthedocs.yml
├── CHANGES.rst
├── CMakeLists.txt
├── LICENSE
├── NOTICE
├── README.rst
├── docs/
│ ├── Makefile
│ ├── README.rst
│ ├── autogenerate.sh
│ └── source/
│ ├── code.rst
│ ├── conf.py
│ ├── developing.rst
│ ├── index.rst
│ ├── introduction.rst
│ ├── phik.decorators.rst
│ ├── phik.rst
│ ├── phik_index.rst
│ ├── publication.rst
│ └── tutorials.rst
├── example.py
├── phik/
│ ├── __init__.py
│ ├── betainc.py
│ ├── binning.py
│ ├── bivariate.py
│ ├── data_quality.py
│ ├── decorators/
│ │ ├── __init__.py
│ │ └── pandas.py
│ ├── definitions.py
│ ├── entry_points.py
│ ├── notebooks/
│ │ ├── phik_tutorial_advanced.ipynb
│ │ ├── phik_tutorial_basic.ipynb
│ │ └── phik_tutorial_spark.ipynb
│ ├── outliers.py
│ ├── phik.py
│ ├── report.py
│ ├── resources.py
│ ├── significance.py
│ ├── simcore/
│ │ ├── __init__.py
│ │ ├── asa159.cpp
│ │ ├── asa159.hpp
│ │ ├── bindings.cpp
│ │ └── simulation.hpp
│ ├── simulation.py
│ ├── statistics.py
│ └── utils.py
├── pyproject.toml
└── tests/
├── integration/
│ ├── test_phik_tutorial_advanced.py
│ └── test_phik_tutorial_basic.py
└── test_phik.py
SYMBOL INDEX (118 symbols across 21 files)
FILE: docs/source/conf.py
function skip (line 165) | def skip(app, what, name, obj, skip, options):
function setup (line 171) | def setup(app):
FILE: phik/betainc.py
function contfractbeta (line 20) | def contfractbeta(
function incompbeta (line 70) | def incompbeta(a: float, b: float, x: float) -> float:
function log_incompbeta (line 101) | def log_incompbeta(a: float, b: float, x: float) -> Tuple[float, float]:
FILE: phik/binning.py
function bin_edges (line 26) | def bin_edges(
function bin_array (line 52) | def bin_array(
function bin_data (line 87) | def bin_data(
function auto_bin_data (line 149) | def auto_bin_data(
function create_correlation_overview_table (line 186) | def create_correlation_overview_table(
function hist2d_from_rebinned_df (line 210) | def hist2d_from_rebinned_df(
function hist2d (line 246) | def hist2d(
function hist2d_from_array (line 296) | def hist2d_from_array(
FILE: phik/bivariate.py
function _mvn_un (line 33) | def _mvn_un(rho: float, lower: tuple, upper: tuple,
function _calc_mvnun (line 50) | def _calc_mvnun(lower, upper, mu, S, rng = np.random.default_rng(42)):
function _mvn_array (line 58) | def _mvn_array(rho: float, sx: np.ndarray, sy: np.ndarray) -> list:
function bivariate_normal_theory (line 102) | def bivariate_normal_theory(
function chi2_from_phik (line 146) | def chi2_from_phik(
function phik_from_chi2 (line 210) | def phik_from_chi2(
FILE: phik/data_quality.py
function dq_check_nunique_values (line 24) | def dq_check_nunique_values(
function dq_check_hist2d (line 98) | def dq_check_hist2d(hist2d: np.ndarray) -> bool:
FILE: phik/entry_points.py
function phik_trial (line 17) | def phik_trial():
FILE: phik/outliers.py
function poisson_obs_p (line 33) | def poisson_obs_p(nobs: int, nexp: float, nexperr: float) -> float:
function log_poisson_obs_p (line 76) | def log_poisson_obs_p(nobs: int, nexp: float, nexperr: float) -> Tuple[f...
function poisson_obs_z (line 122) | def poisson_obs_z(nobs: int, nexp: float, nexperr: float) -> float:
function poisson_obs_mid_p (line 155) | def poisson_obs_mid_p(nobs: int, nexp: float, nexperr: float) -> float:
function log_poisson_obs_mid_p (line 177) | def log_poisson_obs_mid_p(
function poisson_obs_mid_z (line 212) | def poisson_obs_mid_z(nobs: int, nexp: float, nexperr: float) -> float:
function get_independent_frequency_estimates (line 246) | def get_independent_frequency_estimates(
function get_uncertainty (line 288) | def get_uncertainty(x: float, CI_method: str = "poisson") -> float:
function get_poisson_uncertainty (line 312) | def get_poisson_uncertainty(x: float) -> float:
function get_exact_poisson_uncertainty (line 323) | def get_exact_poisson_uncertainty(x: float, nsigmas: float = 1) -> float:
function get_outlier_significances (line 350) | def get_outlier_significances(
function outlier_significance_matrix_from_hist2d (line 376) | def outlier_significance_matrix_from_hist2d(
function outlier_significance_matrix_from_rebinned_df (line 395) | def outlier_significance_matrix_from_rebinned_df(
function outlier_significance_matrix (line 467) | def outlier_significance_matrix(
function outlier_significance_matrices_from_rebinned_df (line 530) | def outlier_significance_matrices_from_rebinned_df(
function outlier_significance_matrices (line 584) | def outlier_significance_matrices(
function outlier_significance_from_array (line 653) | def outlier_significance_from_array(
function outlier_significance_from_binned_array (line 707) | def outlier_significance_from_binned_array(
FILE: phik/phik.py
function spark_phik_matrix_from_hist2d_dict (line 35) | def spark_phik_matrix_from_hist2d_dict(spark_context, hist_dict: dict):
function _phik_from_row (line 61) | def _phik_from_row(row: Tuple[str, np.ndarray]) -> Tuple[str, str, float]:
function phik_from_hist2d (line 79) | def phik_from_hist2d(
function phik_from_rebinned_df (line 117) | def phik_from_rebinned_df(
function _calc_phik (line 185) | def _calc_phik(
function phik_matrix (line 211) | def phik_matrix(
function global_phik_from_rebinned_df (line 266) | def global_phik_from_rebinned_df(
function global_phik_array (line 329) | def global_phik_array(
function phik_from_array (line 394) | def phik_from_array(
function phik_from_binned_array (line 447) | def phik_from_binned_array(
function phik_observed_vs_expected_from_rebinned_df (line 498) | def phik_observed_vs_expected_from_rebinned_df(
function _calc_phik_obs_vs_exp (line 579) | def _calc_phik_obs_vs_exp(
FILE: phik/report.py
function plot_hist_and_func (line 33) | def plot_hist_and_func(
function plot_correlation_matrix (line 101) | def plot_correlation_matrix(
function correlation_report (line 276) | def correlation_report(
FILE: phik/resources.py
function _resource (line 34) | def _resource(resource_type, name: str) -> str:
function fixture (line 55) | def fixture(name: str) -> str:
function notebook (line 66) | def notebook(name: str) -> str:
FILE: phik/significance.py
function fit_test_statistic_distribution (line 36) | def fit_test_statistic_distribution(
function hfunc (line 77) | def hfunc(x: float, N: float, f: float, k: float, sigma: float) -> float:
function significance_from_chi2_ndof (line 93) | def significance_from_chi2_ndof(chi2: float, ndof: float) -> Tuple[float...
function significance_from_chi2_asymptotic (line 117) | def significance_from_chi2_asymptotic(
function significance_from_chi2_MC (line 136) | def significance_from_chi2_MC(
function significance_from_chi2_hybrid (line 173) | def significance_from_chi2_hybrid(
function significance_from_hist2d (line 238) | def significance_from_hist2d(
function significance_from_rebinned_df (line 294) | def significance_from_rebinned_df(
function significance_matrix (line 374) | def significance_matrix(
function significance_from_array (line 434) | def significance_from_array(
function significance_from_binned_array (line 497) | def significance_from_binned_array(
FILE: phik/simcore/__init__.py
function _sim_2d_data_patefield (line 15) | def _sim_2d_data_patefield(*args, **kwargs):
FILE: phik/simcore/asa159.cpp
function i4_max (line 24) | int i4_max ( int i1, int i2 )
function i4_min (line 65) | int i4_min ( int i1, int i2 )
function i4mat_print (line 106) | void i4mat_print ( int m, int n, int a[], string title )
function i4mat_print_some (line 147) | void i4mat_print_some ( int m, int n, int a[], int ilo, int jlo, int ihi,
function i4vec_print (line 274) | void i4vec_print ( int n, int a[], string title )
function i4vec_sum (line 321) | int i4vec_sum ( int n, int a[] )
function r8_uniform_01 (line 377) | double r8_uniform_01 ( int *seed )
function rcont2 (line 442) | void rcont2 ( int nrow, int ncol, int nrowt[], int ncolt[], bool *key,
function timestamp (line 781) | void timestamp ( )
FILE: phik/simcore/bindings.cpp
function PYBIND11_MODULE (line 4) | PYBIND11_MODULE(_phik_simulation_core, m) { bind_simulation(m); }
FILE: phik/simcore/simulation.hpp
type simulation_error (line 18) | struct simulation_error: std::exception {
method simulation_error (line 20) | explicit simulation_error(const char* message) : p_message(message) {}
function _sim_2d_data_patefield (line 24) | void _sim_2d_data_patefield(
function bind_simulation (line 74) | void bind_simulation(py::module &m) {
FILE: phik/simulation.py
function sim_2d_data (line 27) | def sim_2d_data(hist:np.ndarray, ndata:int=0) -> np.ndarray:
function sim_2d_data_patefield (line 51) | def sim_2d_data_patefield(data: np.ndarray, seed : int = None) -> np.nda...
function sim_2d_product_multinominal (line 90) | def sim_2d_product_multinominal(data:np.ndarray, axis: int) -> np.ndarray:
function sim_data (line 108) | def sim_data(data:np.ndarray, method:str='multinominal') -> np.ndarray:
function sim_chi2_distribution (line 138) | def sim_chi2_distribution(values: np.ndarray, nsim:int=1000, lambda_:str...
function _simulate_and_fit (line 163) | def _simulate_and_fit(exp_dep: np.ndarray, simulation_method: str='multi...
FILE: phik/statistics.py
function get_dependent_frequency_estimates (line 22) | def get_dependent_frequency_estimates(vals: np.ndarray) -> np.ndarray:
function get_chi2_using_dependent_frequency_estimates (line 36) | def get_chi2_using_dependent_frequency_estimates(
function get_pearson_chi_square (line 62) | def get_pearson_chi_square(
function estimate_ndof (line 95) | def estimate_ndof(chi2values: Union[list, np.ndarray]) -> float:
function estimate_simple_ndof (line 109) | def estimate_simple_ndof(observed: np.ndarray) -> int:
function theoretical_ndof (line 135) | def theoretical_ndof(observed: np.ndarray) -> int:
function z_from_logp (line 149) | def z_from_logp(logp: float, flip_sign: bool = False) -> float:
FILE: phik/utils.py
function array_like_to_dataframe (line 7) | def array_like_to_dataframe(
function guess_interval_cols (line 22) | def guess_interval_cols(df: pd.DataFrame, verbose: bool = False) -> list:
function make_shapes_equal (line 38) | def make_shapes_equal(observed: pd.DataFrame, expected: pd.DataFrame) ->...
FILE: tests/integration/test_phik_tutorial_advanced.py
function test_advanced_notebook (line 24) | def test_advanced_notebook():
FILE: tests/integration/test_phik_tutorial_basic.py
function test_basic_notebook (line 29) | def test_basic_notebook():
FILE: tests/test_phik.py
class PhiKTest (line 30) | class PhiKTest(unittest.TestCase):
method test_phik_calculation (line 33) | def test_phik_calculation(self):
method test_phik_from_hist2d (line 42) | def test_phik_from_hist2d(self):
method test_phik_observed_vs_expected_from_hist2d (line 56) | def test_phik_observed_vs_expected_from_hist2d(self):
method test_phik_matrix (line 72) | def test_phik_matrix(self):
method test_phik_matrix_observed_vs_expected (line 103) | def test_phik_matrix_observed_vs_expected(self):
method test_global_phik (line 139) | def test_global_phik(self):
method test_significance_matrix_asymptotic (line 157) | def test_significance_matrix_asymptotic(self):
method test_significance_matrix_hybrid (line 194) | def test_significance_matrix_hybrid(self):
method test_significance_matrix_mc (line 235) | def test_significance_matrix_mc(self):
method test_hist2d (line 260) | def test_hist2d(self):
method test_hist2d_array (line 274) | def test_hist2d_array(self):
method test_outlier_significance_matrix (line 286) | def test_outlier_significance_matrix(self):
method test_outlier_significance_matrices (line 300) | def test_outlier_significance_matrices(self):
method test_simulation_2d_patefield (line 313) | def test_simulation_2d_patefield(self):
method test_binning_bin_data_bins_tyes (line 326) | def test_binning_bin_data_bins_tyes(self):
Condensed preview — 57 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,084K chars).
[
{
"path": ".gitattributes",
"chars": 35,
"preview": "phik/notebooks/* linguist-vendored\n"
},
{
"path": ".github/dependabot.yml",
"chars": 307,
"preview": "---\nversion: 2\nupdates:\n - package-ecosystem: pip\n directory: /\n # Check for updates once a day\n schedule:\n "
},
{
"path": ".github/workflows/test_matrix.yml",
"chars": 974,
"preview": "name: Test Matrix\n\non:\n workflow_dispatch:\n pull_request:\n push:\n branches:\n - master\n\njobs:\n build:\n nam"
},
{
"path": ".github/workflows/tests.yml",
"chars": 501,
"preview": "name: Test\n\non: push\njobs:\n tests:\n runs-on: ubuntu-latest\n\n steps:\n - uses: actions/checkout@v6\n - name: S"
},
{
"path": ".github/workflows/valgrind.yml",
"chars": 1722,
"preview": "name: Valgrind\n\non:\n pull_request:\n branches:\n - master\n workflow_dispatch:\n\ndefaults:\n run:\n shell: bash\n"
},
{
"path": ".github/workflows/wheels.yml",
"chars": 1974,
"preview": "name: Wheels\n\non:\n workflow_dispatch:\n pull_request:\n push:\n branches:\n - master\n release:\n types:\n -"
},
{
"path": ".gitignore",
"chars": 16,
"preview": "*.so\n*egg-info*\n"
},
{
"path": ".mbuild.sh",
"chars": 434,
"preview": "cmake -S . -G Ninja -B build \\\n -DCMAKE_BUILD_TYPE=Release \\\n -DSKBUILD_PROJECT_NAME=\"phik\" \\\n -DSKBUILD_PROJEC"
},
{
"path": ".readthedocs.yml",
"chars": 328,
"preview": "# Read the Docs configuration file\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details\n# .readth"
},
{
"path": "CHANGES.rst",
"chars": 3840,
"preview": "=============\nRelease notes\n=============\n\nVersion 0.12.5, Jul 2025\n------------------------\n\n- FIX: scipy 1.16.0 no lon"
},
{
"path": "CMakeLists.txt",
"chars": 1855,
"preview": "cmake_minimum_required(VERSION 3.17...3.27)\n\n# Scikit-build-core sets these values for you, or you can just hard-code th"
},
{
"path": "LICENSE",
"chars": 786,
"preview": "##############################################################################\n#\n# Copyright 2016 KPMG Advisory N.V. (un"
},
{
"path": "NOTICE",
"chars": 898,
"preview": "################################################################################################\n#\n# NOTICE: pass-throug"
},
{
"path": "README.rst",
"chars": 4719,
"preview": "==========================\nPhi_K Correlation Constant\n==========================\n\n* Version: 0.12.5. Released: Jul 2025\n"
},
{
"path": "docs/Makefile",
"chars": 7450,
"preview": "# Makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS =\nSPHINXBUILD "
},
{
"path": "docs/README.rst",
"chars": 2533,
"preview": "Generating Documentation with Sphinx\n====================================\n\nThis README is for generating and writing doc"
},
{
"path": "docs/autogenerate.sh",
"chars": 294,
"preview": "#!/bin/bash\n\n# (re)create required directories\nrm -rf autogen\nmkdir -p source/_static autogen\n\n# auto-generate code docu"
},
{
"path": "docs/source/code.rst",
"chars": 81,
"preview": "API Documentation\n=================\n\n.. toctree::\n :maxdepth: 2\n\n phik_index\n"
},
{
"path": "docs/source/conf.py",
"chars": 5415,
"preview": "# -*- coding: utf-8 -*-\n#\n# PhiK documentation build configuration file for sphinx.\n#\n#\n\nimport os\n#from unittest.mock i"
},
{
"path": "docs/source/developing.rst",
"chars": 1273,
"preview": "===========================\nDeveloping and Contributing\n===========================\n\n\nWorking on the package\n-----------"
},
{
"path": "docs/source/index.rst",
"chars": 479,
"preview": ".. PhiK documentation master file, created by\n sphinx-quickstart on Thu Jul 7 14:25:54 2016.\n You can adapt this fi"
},
{
"path": "docs/source/introduction.rst",
"chars": 3360,
"preview": "======================\nWhy did we build this?\n======================\n\nWhen exploring a data set, for example to model on"
},
{
"path": "docs/source/phik.decorators.rst",
"chars": 353,
"preview": "phik.decorators package\n=======================\n\nSubmodules\n----------\n\nphik.decorators.pandas module\n------------------"
},
{
"path": "docs/source/phik.rst",
"chars": 2096,
"preview": "phik package\n============\n\nSubpackages\n-----------\n\n.. toctree::\n\n phik.decorators\n\nSubmodules\n----------\n\nphik.betai"
},
{
"path": "docs/source/phik_index.rst",
"chars": 49,
"preview": "PhiK\n====\n\n.. toctree::\n :maxdepth: 4\n\n phik\n"
},
{
"path": "docs/source/publication.rst",
"chars": 1080,
"preview": "===================\nPublication & Talks\n===================\n\nPublication\n-----------\n\n* peer-reviewed: https://www.scien"
},
{
"path": "docs/source/tutorials.rst",
"chars": 1307,
"preview": "=========\nTutorials\n=========\n\nThis section contains materials on how to use the Phi_K correlation analysis code.\nThere "
},
{
"path": "example.py",
"chars": 952,
"preview": "import pandas as pd\n\nimport phik\nfrom phik import resources, report\n\n# open fake car insurance data\ndf = pd.read_csv( re"
},
{
"path": "phik/__init__.py",
"chars": 669,
"preview": "# flake8: noqa\nimport importlib.metadata\n\nfrom phik import decorators\nfrom phik.outliers import (\n outlier_significan"
},
{
"path": "phik/betainc.py",
"chars": 3969,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/09/05\n\nDescription:\n Implementation of incomplete beta"
},
{
"path": "phik/binning.py",
"chars": 11242,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/09/06\n\nDescription:\n A set of rebinning functions, to "
},
{
"path": "phik/bivariate.py",
"chars": 9557,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2019/11/23\n\nDescription:\n Convert Pearson correlation value"
},
{
"path": "phik/data_quality.py",
"chars": 5148,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/12/28\n\nDescription:\n A set of functions to check for d"
},
{
"path": "phik/decorators/__init__.py",
"chars": 88,
"preview": "# flake8: noqa\n\n# import pandas DataFrame decorators\nfrom phik.decorators import pandas\n"
},
{
"path": "phik/decorators/pandas.py",
"chars": 1263,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nModule: phik.decorators.pandas\n\nCreated: 2018/11/14\n\nDescription:\n D"
},
{
"path": "phik/definitions.py",
"chars": 502,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/09/05\n\nDescription:\n Definitions used throughout the p"
},
{
"path": "phik/entry_points.py",
"chars": 833,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/11/13\n\nDescription:\n Collection of phik entry points\n\n"
},
{
"path": "phik/notebooks/phik_tutorial_advanced.ipynb",
"chars": 70595,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Phi_K advanced tutorial\\n\",\n \""
},
{
"path": "phik/notebooks/phik_tutorial_basic.ipynb",
"chars": 728762,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Phi_K basic tutorial\\n\",\n \"\\n\""
},
{
"path": "phik/notebooks/phik_tutorial_spark.ipynb",
"chars": 5182,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Phi_K spark tutorial\\n\",\n \"\\n\""
},
{
"path": "phik/outliers.py",
"chars": 29263,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/09/05\n\nDescription:\n Functions for calculating the sta"
},
{
"path": "phik/phik.py",
"chars": 24975,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/09/05\n\nDescription:\n Functions for the Phik correlatio"
},
{
"path": "phik/report.py",
"chars": 18453,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/09/06\n\nDescription:\n Functions to create nice correlat"
},
{
"path": "phik/resources.py",
"chars": 2169,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/11/13\n\nDescription:\n Collection of helper functions to"
},
{
"path": "phik/significance.py",
"chars": 21114,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/09/05\n\nDescription:\n Functions for doing the significa"
},
{
"path": "phik/simcore/__init__.py",
"chars": 522,
"preview": "import importlib.util\n\ntry:\n _ext_spec = importlib.util.find_spec(\"phik.lib._phik_simulation_core\")\nexcept ModuleNotF"
},
{
"path": "phik/simcore/asa159.cpp",
"chars": 15402,
"preview": "/*\n * Taken from:\n * https://people.sc.fsu.edu/~jburkardt/cpp_src/asa159/asa159.html\n *\n * Michael Patefield,\n * Algorit"
},
{
"path": "phik/simcore/asa159.hpp",
"chars": 916,
"preview": "/*\n * Taken from:\n * https://people.sc.fsu.edu/~jburkardt/cpp_src/asa159/asa159.html\n *\n * Michael Patefield,\n * Algorit"
},
{
"path": "phik/simcore/bindings.cpp",
"chars": 124,
"preview": "#include \"simulation.hpp\"\n#include <pybind11/pybind11.h>\n\nPYBIND11_MODULE(_phik_simulation_core, m) { bind_simulation(m)"
},
{
"path": "phik/simcore/simulation.hpp",
"chars": 2535,
"preview": "/* python/phik/simulation/simulation.hpp wrapper and bindings for\n * Michael Patefield,\n * Algorithm AS 159: An Efficien"
},
{
"path": "phik/simulation.py",
"chars": 6526,
"preview": "\"\"\"Project: PhiK - correlation analyzer library\n\nCreated: 2018/09/05\n\nDescription:\n Helper functions to simulate 2D d"
},
{
"path": "phik/statistics.py",
"chars": 5524,
"preview": "\"\"\"Project: PhiK - correlation coefficient package\n\nCreated: 2018/09/05\n\nDescription:\n Statistics helper functions, f"
},
{
"path": "phik/utils.py",
"chars": 2928,
"preview": "from typing import Union\n\nimport pandas as pd\nimport numpy as np\n\n\ndef array_like_to_dataframe(\n x: Union[pd.Series, "
},
{
"path": "pyproject.toml",
"chars": 1177,
"preview": "[build-system]\nrequires = [\"scikit-build-core>=0.3.3\", \"pybind11\"]\nbuild-backend = \"scikit_build_core.build\"\n\n[project]\n"
},
{
"path": "tests/integration/test_phik_tutorial_advanced.py",
"chars": 13863,
"preview": "# # Phi_K advanced tutorial\n#\n# This notebook guides you through the more advanced functionality of the phik package. Th"
},
{
"path": "tests/integration/test_phik_tutorial_basic.py",
"chars": 13548,
"preview": "# # Phi_K basic tutorial\n#\n# This notebook guides you through the basic functionality of the phik package. The package o"
},
{
"path": "tests/test_phik.py",
"chars": 11976,
"preview": "\"\"\"Project: Phi_K - correlation coefficient package\n\nCreated: 2018/11/13\n\nDescription:\n Collection of helper function"
}
]
About this extraction
This page contains the full source code of the KaveIO/PhiK GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 57 files (1.0 MB), approximately 561.3k tokens, and a symbol index with 118 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.