Repository: krishnanlab/PecanPy Branch: master Commit: 743196280f33 Files: 36 Total size: 116.0 KB Directory structure: gitextract_5ev1jrt4/ ├── .bumpversion.cfg ├── .github/ │ ├── dependabot.yml │ └── workflows/ │ ├── release.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── LICENSE ├── README.md ├── demo/ │ ├── karate.edg │ ├── reproducibility.sh │ └── run_pecanpy ├── docs/ │ ├── Makefile │ ├── requirements.txt │ └── source/ │ ├── conf.py │ ├── index.rst │ └── pecanpy.rst ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py ├── src/ │ └── pecanpy/ │ ├── __init__.py │ ├── cli.py │ ├── experimental.py │ ├── graph.py │ ├── pecanpy.py │ ├── rw/ │ │ ├── __init__.py │ │ ├── dense_rw.py │ │ └── sparse_rw.py │ ├── typing.py │ └── wrappers.py ├── test/ │ ├── test_cli.py │ ├── test_graph.py │ ├── test_pecanpy.py │ └── test_walk.py └── tox.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .bumpversion.cfg ================================================ [bumpversion] current_version = 2.0.10-dev tag = False commit = True message = bump version: {current_version} -> {new_version} parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)([-](?P<release>(dev|stable)+)(?P<build>\d*))?
serialize = {major}.{minor}.{patch}-{release}{build} {major}.{minor}.{patch}-{release} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = stable values = dev stable [bumpversion:file:setup.cfg] search = version = {current_version} replace = version = {new_version} [bumpversion:file:src/pecanpy/__init__.py] search = __version__ = "{current_version}" replace = version = "{new_version}" [bumpversion:file:docs/source/conf.py] search = release = "{current_version}" replace = release = "{new_version}" ================================================ FILE: .github/dependabot.yml ================================================ # To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/" # Location of package manifests schedule: interval: "daily" ignore: - dependency-name: "numpy" versions: ["1.22.x"] # Numba 0.55.1 do not support numpy 1.22.x yet https://github.com/numba/numba/issues/7754 ================================================ FILE: .github/workflows/release.yml ================================================ name: Release Package on: release: types: [created] jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 - name: Install dependencies run: | python -m pip install --upgrade pip pip install setuptools wheel twine - name: Build and publish env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | python setup.py sdist bdist_wheel twine upload dist/* ================================================ FILE: .github/workflows/tests.yml ================================================ name: Tests on: - push - 
pull_request jobs: test: runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, windows-latest] python-version: ['3.8', '3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install tox tox-gh-actions - name: Test with tox run: tox ================================================ FILE: .gitignore ================================================ # vim buffer *.swp # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # IDEA .idea/ ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: trailing-whitespace exclude: .bumpversion.cfg - id: end-of-file-fixer - repo: https://github.com/asottile/reorder-python-imports rev: v3.12.0 hooks: - id: reorder-python-imports args: ["--py38-plus"] - repo: https://github.com/asottile/add-trailing-comma rev: v3.1.0 hooks: - id: add-trailing-comma - repo: https://github.com/asottile/pyupgrade rev: v3.15.0 hooks: - id: pyupgrade - repo: https://github.com/psf/black rev: 23.12.1 hooks: - id: black args: [--safe] ================================================ FILE: .readthedocs.yml ================================================ # Read the Docs configuration file version: 2 build: os: ubuntu-22.04 tools: python: "3.8" sphinx: configuration: docs/source/conf.py python: install: - requirements: docs/requirements.txt - requirements: requirements.txt ================================================ FILE: LICENSE ================================================ BSD 3-Clause License Copyright (c) 2020-2021, Krishnan Laboratory, Michigan State 
University. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: README.md ================================================ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6386437.svg)](https://doi.org/10.5281/zenodo.6386437) [![Documentation Status](https://readthedocs.org/projects/pecanpy/badge/?version=latest)](https://pecanpy.readthedocs.io/en/latest/?badge=latest) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Tests](https://github.com/krishnanlab/PecanPy/actions/workflows/tests.yml/badge.svg)](https://github.com/krishnanlab/PecanPy/actions/workflows/tests.yml) # PecanPy: A parallelized, efficient, and accelerated _node2vec(+)_ in Python Learning low-dimensional representations (embeddings) of nodes in large graphs is key to applying machine learning on massive biological networks. _Node2vec_ is the most widely used method for node embedding. PecanPy is a fast, parallelized, memory efficient, and cache optimized Python implementation of [_node2vec_](https://github.com/aditya-grover/node2vec). It uses cache-optimized compact graph data structures and precomputing/parallelization to result in fast, high-quality node embeddings for biological networks of all sizes and densities. Detailed source code documentation can be found [here](https://pecanpy.readthedocs.io/). The details of implementation and the optimizations, along with benchmarks, are described in the application note [_PecanPy: a fast, efficient and parallelized Python implementation of node2vec_](https://doi.org/10.1093/bioinformatics/btab202), which is published in _Bioinformatics_. The benchmarking results presented in the preprint can be reproduced using the test scripts provided in the companion [benchmarks repo](https://github.com/krishnanlab/PecanPy_benchmarks). **v2 update**: PecanPy is now equipped with _node2vec+_, which is a natural extension of _node2vec_ and handles weighted graph more effectively. 
For more information, see [*Accurately Modeling Biased Random Walks on Weighted Graphs Using Node2vec+*](https://arxiv.org/abs/2109.08031). The datasets and test scripts for reproducing the presented results are available in the [node2vec+ benchmarks repo](https://github.com/krishnanlab/node2vecplus_benchmarks). ## Installation Install from the latest release with: ```bash $ pip install pecanpy ``` Install the latest (unreleased) version in development mode with: ```bash $ git clone https://github.com/krishnanlab/pecanpy.git $ cd pecanpy $ pip install -e . ``` where `-e` means "editable" mode so you don't have to reinstall every time you make changes. PecanPy installs a command line utility `pecanpy` that can be used directly. ## Usage PecanPy operates in three different modes – `PreComp`, `SparseOTF`, and `DenseOTF` – that are optimized for networks of different sizes and densities; `PreComp` for networks that are small (≤10k nodes; any density), `SparseOTF` for networks that are large and sparse (>10k nodes; ≤10% of edges), and `DenseOTF` for networks that are large and dense (>10k nodes; >10% of edges). These modes appropriately take advantage of compact/dense graph data structures, precomputing transition probabilities, and computing 2nd-order transition probabilities during walk generation to achieve significant improvements in performance. ### Example To run *node2vec* on Zachary's karate club network using `SparseOTF` mode, execute the following command from the project home directory: ```bash pecanpy --input demo/karate.edg --output demo/karate.emb --mode SparseOTF ``` ### Node2vec+ To enable _node2vec+_, specify the `--extend` option. ```bash pecanpy --input demo/karate.edg --output demo/karate_n2vplus.emb --mode SparseOTF --extend ``` **Note**: _node2vec+_ is only beneficial for embedding _weighted_ graphs. For unweighted graphs, _node2vec+_ is equivalent to _node2vec_. The above example only serves as a demonstration of enabling _node2vec+_.
### Demo Execute the following command for a full demonstration: ```bash sh demo/run_pecanpy ``` ### Mode As mentioned above, PecanPy contains three main modes for generating node2vec random walks, each of which is better optimized for different network sizes/densities: | Mode | Network size/density | Optimization | |:-----|:---------------------|:-------------| | `PreComp` | <10k nodes, <0.1% edges | Precompute second order transition probabilities, using CSR graph | | `SparseOTF` (default) | (≥10k nodes, ≥0.1% and <20% of edges) or (<10k nodes, ≥0.1% edges) | Transition probabilities computed on-the-fly, using CSR graph | | `DenseOTF` | ≥20% of edges | Transition probabilities computed on-the-fly, using dense matrix | #### Compatibility and recommendations | Mode | Weighted | ``p,q!=1`` | Node2vec+ | Speed | Use this if | |:-----|----------------|---------------|-----------|:------------|:--------| |``PreComp``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash::dash:|The graph is small and sparse| |``SparseOTF``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash:|The graph is sparse but not necessarily small| |``DenseOTF``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash:|The graph is extremely dense| |``PreCompFirstOrder``|:white_check_mark:|:x:|:x:|:dash::dash:|Run with ``p = q = 1`` on weighted graph| |``FirstOrderUnweighted``|:x:|:x:|:x:|:dash::dash::dash:|Run with ``p = q = 1`` on unweighted graph| ### Options Check out the full list of options available using: ```bash pecanpy --help ``` ### Input The supported input is a network file as an edgelist `.edg` file (node id could be int or string): ``` node1_id node2_id ``` Another supported input format (only for `DenseOTF`) is the numpy array `.npz` file. Run the following command to prepare a `.npz` file from a `.edg` file.
```bash pecanpy --input $input_edgelist --output $output_npz --task todense ``` The default delimiter for `.edg` is a tab (`\t`); you may change this by passing the `--delimiter` option. ### Output The output file has *n+1* lines for a graph with *n* vertices, with a header line of the following format: ``` num_of_nodes dim_of_representation ``` The next *n* lines are the representations of dimension *d* following the corresponding node ID: ``` node_id dim_1 dim_2 ... dim_d ``` ### Development Note Run `black src/pecanpy/` to automatically follow black code formatting. Run `tox -e flake8` and resolve suggestions before committing to ensure consistent code style. ## Additional Information ### Documentation Detailed documentation for PecanPy is available [here](https://pecanpy.readthedocs.io/). ### Support For support, please consider opening a GitHub issue and we will do our best to reply in a timely manner. Alternatively, if you would like to keep the conversation private, feel free to contact [Remy Liu](https://twitter.com/RemyLau3) at liurenmi@msu.edu. ### License This repository and all its contents are released under the [BSD 3-Clause License](https://opensource.org/licenses/BSD-3-Clause); see [LICENSE](https://github.com/krishnanlab/pecanpy/blob/master/LICENSE). ### Citation If you use PecanPy, please cite: Liu R, Krishnan A (2021) **PecanPy: a fast, efficient, and parallelized Python implementation of _node2vec_.** _Bioinformatics_ https://doi.org/10.1093/bioinformatics/btab202 If you find _node2vec+_ useful, please cite: Liu R, Hirn M, Krishnan A (2023) **Accurately modeling biased random walks on weighted graphs using _node2vec+_.** _Bioinformatics_ https://doi.org/10.1093/bioinformatics/btad047 ### Authors Renming Liu, Arjun Krishnan* >\*General correspondence should be addressed to AK at arjun.krishnan@cuanschutz.edu.
### Funding This work was primarily supported by US National Institutes of Health (NIH) grants R35 GM128765 to AK and in part by MSU start-up funds to AK. ### Acknowledgements We thank [Christopher A. Mancuso](https://github.com/ChristopherMancuso), [Anna Yannakopoulos](http://yannakopoulos.com/), and the rest of the [Krishnan Lab](https://www.thekrishnanlab.org/team) for valuable discussions and feedback on the software and manuscript. Thanks to [Charles T. Hoyt](https://github.com/cthoyt) for making the software `pip` installable and for an extensive code review. ### References **Original _node2vec_** * Grover, A. and Leskovec, J. (2016) node2vec: Scalable Feature Learning for Networks. ArXiv160700653 Cs Stat. Original _node2vec_ software and networks * https://snap.stanford.edu/node2vec/ contains the original software and the networks (PPI, BlogCatalog, and Wikipedia) used in the original study (Grover and Leskovec, 2016). **Other networks** * Stark, C. et al. (2006) BioGRID: a general repository for interaction datasets. Nucleic Acids Res., 34, D535–D539. * BioGRID human protein-protein interactions. * Szklarczyk, D. et al. (2015) STRING v10: protein–protein interaction networks, integrated over the tree of life. Nucleic Acids Res., 43, D447–D452. * STRING predicted human gene interactions. * Greene, C.S. et al. (2015) Understanding multicellular function and disease with human tissue-specific networks. Nat. Genet., 47, 569–576. * GIANT-TN is a generic genome-scale human gene network. GIANT-TN-c01 is a sub-network of GIANT-TN where edges with edge weight below 0.01 are discarded. BioGRID (Stark et al., 2006), STRING (Szklarczyk et al., 2015), and GIANT-TN (Greene et al., 2015) are available from https://doi.org/10.5281/zenodo.3352323. * Law, J.N. et al. (2019) Accurate and Efficient Gene Function Prediction using a Multi-Bacterial Network. bioRxiv, 646687. 
* SSN200 is a cross-species network of proteins from 200 species with the edges representing protein sequence similarities. Downloaded from https://bioinformatics.cs.vt.edu/~jeffl/supplements/2019-fastsinksource/. ================================================ FILE: demo/karate.edg ================================================ 1 32 1 22 1 20 1 18 1 14 1 13 1 12 1 11 1 9 1 8 1 7 1 6 1 5 1 4 1 3 1 2 2 31 2 22 2 20 2 18 2 14 2 8 2 4 2 3 3 14 3 9 3 10 3 33 3 29 3 28 3 8 3 4 4 14 4 13 4 8 5 11 5 7 6 17 6 11 6 7 7 17 9 34 9 33 9 33 10 34 14 34 15 34 15 33 16 34 16 33 19 34 19 33 20 34 21 34 21 33 23 34 23 33 24 30 24 34 24 33 24 28 24 26 25 32 25 28 25 26 26 32 27 34 27 30 28 34 29 34 29 32 30 34 30 33 31 34 31 33 32 34 32 33 33 34 ================================================ FILE: demo/reproducibility.sh ================================================ #!/bin/bash --login # reproducibility.sh # Test the reproducibility of PecanPy between runs. source ~/.bashrc rs=100 export PYTHONHASHSEED=$rs conda activate pecanpy-dev pecanpy --input karate.edg --output karate1.emd --mode FirstOrderUnweighted --workers 1 --random_state $rs pecanpy --input karate.edg --output karate2.emd --mode FirstOrderUnweighted --workers 1 --random_state $rs cmp karate1.emd karate2.emd rm -f karate1.emd karate2.emd ================================================ FILE: demo/run_pecanpy ================================================ #!/bin/bash cd $(dirname $(realpath $0)) cd ../ set -v # run with PreComp mode (default) pecanpy --input demo/karate.edg --output demo/karate.emb --verbose # run with SparseOTF mode pecanpy --input demo/karate.edg --output demo/karate.emb --verbose --mode SparseOTF # run with DenseOTF mode pecanpy --input demo/karate.edg --output demo/karate.emb --verbose --mode DenseOTF # convert and save edgelist as dense matrix pecanpy --input demo/karate.edg --output demo/karate.npz --task todense # run with DenseOTF mode using dense array as input pecanpy --input 
demo/karate.npz --output demo/karate.emb --verbose --mode DenseOTF # input parameters pecanpy --help ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/requirements.txt ================================================ sphinx sphinx_rtd_theme ================================================ FILE: docs/source/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# import os import sys sys.path.insert(0, os.path.abspath("../../src")) # -- Project information ----------------------------------------------------- project = "PecanPy" copyright = "2020, Renming Liu and Arjun Krishnan" author = "Renming Liu and Arjun Krishnan" # The full version, including alpha/beta/rc tags release = "2.0.10-dev" # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.todo", "sphinx.ext.coverage", "sphinx.ext.viewcode", "sphinx.ext.napoleon", ] # Napoleon settings napoleon_google_docstring = True # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "networkx": ("https://networkx.github.io/documentation/latest/", None), } autodoc_member_order = "bysource" autoclass_content = "both" ================================================ FILE: docs/source/index.rst ================================================ Welcome to PecanPy's documentation ================================== .. toctree:: :maxdepth: 2 pecanpy ================================================ FILE: docs/source/pecanpy.rst ================================================ PecanPy package =============== Command line interface ---------------------- .. automodule:: pecanpy.cli :members: :undoc-members: :show-inheritance: Graph Data Structures --------------------- .. automodule:: pecanpy.graph :members: :undoc-members: :show-inheritance: Node2vec implementations ------------------------ .. automodule:: pecanpy.pecanpy :members: :undoc-members: :show-inheritance: ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools>=42.0", "wheel"] build-backend = "setuptools.build_meta" [tool.mypy] ignore_missing_imports = true follow_imports = "skip" plugins = [ "numpy.typing.mypy_plugin", ] ================================================ FILE: requirements.txt ================================================ gensim==4.3.2 nptyping==2.5.0 numba-progress==1.1.0 numba==0.58.1 numpy==1.23.2 scipy<1.13 # triu import issue (https://stackoverflow.com/a/78279318/12519564) typing_extensions==4.13.2 ================================================ FILE: setup.cfg ================================================ [metadata] name = pecanpy version = 2.0.10-dev author = Remy Liu author_email = liurenmi@msu.edu description = A parallelized, efficient, and accelerated node2vec long_description = file: README.md long_description_content_type = text/markdown # Links url = 
https://github.com/krishnanlab/PecanPy project_urls = Documentation = https://pecanpy.readthedocs.io/ # License license_files = file: LICENSE license = BSD 3-Clause License # Search tags classifiers = Development Status :: 5 - Production/Stable Programming Language :: Python Programming Language :: Python :: 3 :: Only Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 keywords = Network Embedding [options] install_requires = gensim>=4.1.0 numpy>=1.20.0 numba>=0.46.0 numba-progress>=0.0.2 nptyping>=2.0.0 typing_extensions>=4.0.1 zip_safe = false include_package_data = True python_requires = >=3.8 # Where is my code packages = find: package_dir = = src [options.extras_require] dev = bump2version==1.0.1 mypy==1.9.0 parameterized==0.9.0 pre-commit==3.5.0; python_version < "3.9" pre-commit==4.2.0; python_version >= "3.9" pytest-cov==5.0.0 pytest-xdist==3.6.1 pytest==8.3.5 tox==4.25.0 [options.packages.find] where = src [options.entry_points] console_scripts = pecanpy = pecanpy.cli:main ================================================ FILE: setup.py ================================================ """Setup module.""" import setuptools if __name__ == "__main__": setuptools.setup() ================================================ FILE: src/pecanpy/__init__.py ================================================ """PecanPy: parallelized, efficient, and accelerated node2vec.""" from . import graph from . import pecanpy version = "2.0.10-dev" __all__ = ["graph", "pecanpy"] ================================================ FILE: src/pecanpy/cli.py ================================================ """Command line utility for PecanPy. This is the command line interface for the ``pecanpy`` package. 
Examples: Run PecanPy in command line using ``PreComp`` mode to embed the karate network:: $ pecanpy --input demo/karate.edg --ouptut demo/karate.emb --mode PreComp Checkout the full list of parameters by:: $ pecanpy --help """ import argparse import warnings import numba import numpy as np from gensim.models import Word2Vec from . import graph from . import pecanpy from .wrappers import Timer def parse_args(): """Parse node2vec arguments.""" parser = argparse.ArgumentParser( description="Run pecanpy, a parallelized, efficient, and accelerated " "Python implementation of node2vec", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "--input", required=True, help="Input graph (.edg or .npz) file path.", ) parser.add_argument( "--output", required=True, help="Output embeddings file path. Save as .npz file if the specified " "file path ends with .npz, otherwise save as a text file using the " "gensim save_word2vec_format method.", ) parser.add_argument( "--task", default="pecanpy", choices=["pecanpy", "tocsr", "todense"], help="Task to be performed.", ) parser.add_argument( "--mode", default="SparseOTF", choices=[ "DenseOTF", "FirstOrderUnweighted", "PreComp", "PreCompFirstOrder", "SparseOTF", ], help="PecanPy execution mode.", ) parser.add_argument( "--dimensions", type=int, default=128, help="Number of dimensions.", ) parser.add_argument( "--walk-length", type=int, default=80, help="Length of walk per source.", ) parser.add_argument( "--num-walks", type=int, default=10, help="Number of walks per source.", ) parser.add_argument( "--window-size", type=int, default=10, help="Context size for optimization.", ) parser.add_argument( "--epochs", type=int, default=1, help="Number of epochs in SGD when training Word2Vec", ) parser.add_argument( "--workers", type=int, default=0, help="Number of parallel workers (0 to use all available threads).", ) parser.add_argument( "--p", type=float, default=1, help="Return hyperparameter.", ) 
parser.add_argument( "--q", type=float, default=1, help="Inout hyperparameter.", ) parser.add_argument( "--weighted", action="store_true", help="Boolean specifying (un)weighted.", ) parser.add_argument( "--directed", action="store_true", help="Graph is (un)directed.", ) parser.add_argument( "--verbose", action="store_true", help="Print out training details", ) parser.add_argument( "--extend", action="store_true", help="Use node2vec+ extension", ) parser.add_argument( "--gamma", type=float, default=0, help="Noisy edge threshold parameter.", ) parser.add_argument( "--random_state", type=int, default=None, help="Random seed for generating random walks.", ) parser.add_argument( "--delimiter", type=str, default="\t", help="Delimiter used between node IDs.", ) parser.add_argument( "--implicit_ids", action="store_true", help="If set, use canonical node ordering for the node IDs.", ) return parser.parse_args() def check_mode(g, args): """Check mode selection. Give recommendation to user for pecanpy mode based on graph size and density. """ mode = args.mode weighted = args.weighted p = args.p q = args.q # Check unweighted first order random walk usage if mode == "FirstOrderUnweighted": if not p == q == 1 or weighted: raise ValueError( f"FirstOrderUnweighted only works when weighted = False and " f"p = q = 1, got {weighted=}, {p=}, {q=}", ) return if mode != "FirstOrderUnweighted" and p == q == 1 and not weighted: warnings.warn( "When p = 1 and q = 1 with unweighted graph, it is highly " f"recommended to use FirstOrderUnweighted over {mode} (current " "selection). 
The runtime could be improved greatly with improved " "memory usage.", stacklevel=2, ) return # Check first order random walk usage if mode == "PreCompFirstOrder": if not p == q == 1: raise ValueError( f"PreCompFirstOrder only works when p = q = 1, got {p=}, {q=}", ) return if mode != "PreCompFirstOrder" and p == 1 == q: warnings.warn( "When p = 1 and q = 1, it is highly recommended to use " f"PreCompFirstOrder over {mode} (current selection). The runtime " "could be improved greatly with low memory usage.", stacklevel=2, ) return # Check network density and recommend appropriate mode g_size = g.num_nodes g_dens = g.density if (g_dens >= 0.2) & (mode != "DenseOTF"): warnings.warn( f"Network density = {g_dens:.3f} (> 0.2), it is recommended to use " f"DenseOTF over {mode} (current selection)", stacklevel=2, ) if (g_dens < 0.001) & (g_size < 10000) & (mode != "PreComp"): warnings.warn( f"Network density = {g_dens:.2e} (< 0.001) with {g_size} nodes " f"(< 10000), it is recommended to use PreComp over {mode} (current " "selection)", stacklevel=2, ) if (g_dens >= 0.001) & (g_dens < 0.2) & (mode != "SparseOTF"): warnings.warn( f"Network density = {g_dens:.3f}, it is recommended to use " f"SparseOTF over {mode} (current selection)", stacklevel=2, ) if (g_dens < 0.001) & (g_size >= 10000) & (mode != "SparseOTF"): warnings.warn( f"Network density = {g_dens:.3f} (< 0.001) with {g_size} nodes " f"(>= 10000), it is recommended to use SparseOTF over {mode} " "(current selection)", stacklevel=2, ) @Timer("load Graph") def read_graph(args): """Read input network to memory. Depending on the mode selected, reads the network either in CSR representation (``PreComp`` and ``SparseOTF``) or 2d numpy array (``DenseOTF``). 
""" path = args.input output = args.output p = args.p q = args.q workers = args.workers verbose = args.verbose weighted = args.weighted directed = args.directed extend = args.extend gamma = args.gamma random_state = args.random_state mode = args.mode task = args.task delimiter = args.delimiter implicit_ids = args.implicit_ids if directed and extend: raise NotImplementedError("Node2vec+ not implemented for directed graph yet.") if extend and not weighted: print("NOTE: node2vec+ is equivalent to node2vec for unweighted graphs.") if task in ["tocsr", "todense"]: # perform conversion then save and exit g = graph.SparseGraph() if task == "tocsr" else graph.DenseGraph() g.read_edg(path, weighted, directed, delimiter) g.save(output) exit() pecanpy_mode = getattr(pecanpy, mode, None) g = pecanpy_mode(p, q, workers, verbose, extend, gamma, random_state) if path.endswith(".npz"): g.read_npz(path, weighted, implicit_ids=implicit_ids) else: g.read_edg(path, weighted, directed, delimiter) check_mode(g, args) return g @Timer("train embeddings") def learn_embeddings(args, walks): """Learn embeddings by optimizing the Skipgram objective using SGD.""" model = Word2Vec( walks, vector_size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, epochs=args.epochs, seed=args.random_state, ) output_path = args.output if output_path.endswith(".npz"): np.savez(output_path, IDs=model.wv.index_to_key, data=model.wv.vectors) else: model.wv.save_word2vec_format(output_path) @Timer("pre-compute transition probabilities") def preprocess(g): """Preprocessing transition probabilities with timer.""" g.preprocess_transition_probs() @Timer("generate walks") def simulate_walks(args, g): """Simulate random walks with timer.""" return g.simulate_walks(args.num_walks, args.walk_length) def main(): """Pipeline for representational learning for all nodes in a graph.""" args = parse_args() if args.workers == 0: args.workers = numba.config.NUMBA_DEFAULT_NUM_THREADS 
numba.set_num_threads(args.workers) g = read_graph(args) preprocess(g) walks = simulate_walks(args, g) learn_embeddings(args, walks) if __name__ == "__main__": main() ================================================ FILE: src/pecanpy/experimental.py ================================================ """Experimental features.""" import numpy as np from numba import njit from pecanpy.pecanpy import Base from pecanpy.rw.dense_rw import DenseRWGraph class Node2vecPlusPlus(Base, DenseRWGraph): """Continuous extension of node2vec+ with DenseOTF framework. In node2vec+ (see `DenseRWGraph.get_extended_normalized_probs`), there is discontinuous region of the bias-factor (alpha). More specifically, the transition between the noisy-edge region (w1 < 1 and w2 < 1, where w1 is the normalized edge weight connecting from current to the previous node, and w2 is similarly defined for the edge weight connecting from the next to the previous node), and the "in-out" region (w1 > 1 or w2 > 1). This continuous extension version of node2vec+, i.e., node2vec++, aims to provide continuity to those regions by parameterizing the bias-factor as a continuous function of w1 and w2. The basic idea is to use w2 to control the interpolation between 1 and 1 / q as before, but in addition, use w1 to parameterize the curvature of the interpolation, so as w1 approaches zero, the bias-factor goes to min{1, 1 / q} (note that previously, the bias-factor is set to min{1, 1 / q} whenever w1 falls below one). 
""" def __init__(self, *args, **kwargs): Base.__init__(self, *args, **kwargs) def get_move_forward(self): """Wrap ``move_forward``.""" data = self.data nonzero = self.nonzero p = self.p q = self.q noise_thresholds = self.get_noise_thresholds() get_normalized_probs = self.get_normalized_probs @njit(nogil=True) def move_forward(cur_idx, prev_idx=None): """Move to next node.""" normalized_probs = get_normalized_probs( data, nonzero, p, q, cur_idx, prev_idx, noise_thresholds, ) cdf = np.cumsum(normalized_probs) choice = np.searchsorted(cdf, np.random.random()) nbrs = np.where(nonzero[cur_idx])[0] return nbrs[choice] return move_forward @staticmethod @njit(nogil=True) def get_normalized_probs( data, nonzero, p, q, cur_idx, prev_idx, noise_threshold_ary, ): """Calculate node2vec++ transition probabilities.""" cur_nbrs_ind = nonzero[cur_idx] cur_nbrs_weight = data[cur_idx].copy() if prev_idx is not None: # 2nd order biased walks prev_nbrs_weight = data[prev_idx].copy() # Note: we assume here the network is undirected, hence the edge # weight connecting the next to prev is the same as the reverse. out_ind = cur_nbrs_ind & (prev_nbrs_weight < noise_threshold_ary) out_ind[prev_idx] = False # exclude previous state from out biases t = prev_nbrs_weight[out_ind] / noise_threshold_ary[out_ind] # Determine whether to use '1 - t' or 't' depending on whether q # is less than or greater than one so that alpha is suppressed to # min{1, 1 / q} as w1 approaches 0. 
t = 1 - t.clip(0, 1) if q < 1 else t.clip(0, 1) b = cur_nbrs_weight[out_ind] / noise_threshold_ary[out_ind] # compute out biases scale = np.abs(1 - 1 / q) offset = np.minimum(1, 1 / q) alpha = t * b / (1 + (b - 1)) * scale + offset cur_nbrs_weight[out_ind] *= alpha # apply out biases cur_nbrs_weight[prev_idx] /= p # apply the return bias unnormalized_probs = cur_nbrs_weight[cur_nbrs_ind] normalized_probs = unnormalized_probs / unnormalized_probs.sum() return normalized_probs ================================================ FILE: src/pecanpy/graph.py ================================================ """Lite graph objects used by pecanpy.""" import warnings import numpy as np from .typing import AdjMat from .typing import AdjNonZeroMat from .typing import CSR from .typing import Dict from .typing import Float32Array from .typing import Iterator from .typing import List from .typing import Optional from .typing import Sequence from .typing import Tuple from .typing import Uint32Array class BaseGraph: """Base Graph object. Handles node id and provides general properties including num_nodes, and density. The num_edges property is to be specified by the derived graph objects. 
""" def __init__(self): self._node_ids: List[str] = [] self._node_idmap: Dict[str, int] = {} # id -> index @property def nodes(self) -> List[str]: """Return the list of node IDs.""" return self._node_ids @property def num_nodes(self) -> int: """Return the number of nodes in the graph.""" return len(self.nodes) @property def num_edges(self) -> int: """Return the number of edges in the graph.""" raise NotImplementedError( f"{self.__class__.__name__} does not have num_edges, use the " f"derived classes like SparseGraph and DenseGraph instead.", ) @property def density(self) -> float: """Return the edge density of the graph.""" return self.num_edges / self.num_nodes / (self.num_nodes - 1) def set_node_ids( self, node_ids: Optional[Sequence[str]], implicit_ids: bool = False, num_nodes: Optional[int] = None, ): """Update ID list and mapping. Set _node_ids given the input node IDs and also set the corresponding _node_idmap based on it, which maps from node ID to the index. Args: node_ids (:obj:`list` of :obj:`str`, optional): List of node IDs to use. If not available, will implicitly set node IDs to the canonical ordering of nodes with a warning message, which is suppressed if `implicit_ids` is set to True. implicit_ids (bool): Implicitly set the node IDs to the canonical node ordering. If set to False and node IDs are not available, it will also set implicit node IDs, but with a warning message. The warning message can be suppressed if `implicit_ids` is set to True as a confirmation of the behavior. num_nodes (int, optional): Number of nodes, used when try to set implicit node IDs. 
""" if (node_ids is not None) and (not implicit_ids): self._node_ids = list(node_ids) elif num_nodes is None: raise ValueError( "Need to specify `num_nodes` when setting implicit node IDs.", ) else: self.set_node_ids(list(map(str, range(num_nodes)))) if not implicit_ids: warnings.warn( "WARNING: Implicitly set node IDs to the canonical node " "ordering due to missing IDs field in the raw CSR npz " "file. This warning message can be suppressed by setting " "implicit_ids to True in the read_npz function call, or " "by setting the --implicit_ids flag in the CLI", stacklevel=2, ) self._node_idmap = {j: i for i, j in enumerate(self._node_ids)} def get_has_nbrs(self): """Abstract method to be specified by derived classes.""" raise NotImplementedError def get_move_forward(self): """Abstract method to be specified by derived classes.""" raise NotImplementedError class AdjlstGraph(BaseGraph): """Adjacency list Graph object used for reading/writing edge list files. Sparse Graph object that stores graph as adjacency list. Note: AdjlstGraph is only used for reading/writing edge list files and do not support random walk computations since Numba njit do not work with Python data structures like list and dict. Examples: Read ``.edg`` file and create ``SparseGraph`` object using ``.read_edg`` method. 
@staticmethod
def _is_valid_edge_weight(id1: str, id2: str, weight: float) -> bool:
    """Check if the edge weight is strictly positive.

    Args:
        id1 (str): first node id, only used in the warning message.
        id2 (str): second node id, only used in the warning message.
        weight (float): the edge weight to be checked.

    Return:
        bool: True if the weight is positive. Otherwise (zero, negative, or
        NaN) a ``RuntimeWarning`` is issued and False is returned.

    """
    # ``not weight > 0`` instead of ``weight <= 0`` so that NaN, for which
    # every ordering comparison is False, is rejected as well.
    if not weight > 0:
        edg_str = f"w({id1},{id2}) = {weight}"
        warnings.warn(
            f"Non-positive edge ignored: {edg_str}",
            RuntimeWarning,
            stacklevel=2,
        )
        return False
    return True
def read(
    self,
    path: str,
    weighted: bool,
    directed: bool,
    delimiter: str = "\t",
):
    """Read an edgelist file and create sparse graph.

    Note:
        Blank lines are skipped. Non-positive weighted edges are
        implicitly discarded; if the same edge is defined multiple times
        with different edge weights, then the last specified weight will
        be used (warning for such behavior will be printed).

    Args:
        path (str): path to edgelist file, where the file is tab separated
            and contains 2 or 3 columns depending on whether the input
            graph is weighted, where the first column contains the source
            nodes and the second column contains the destination nodes
            that interact with the corresponding source nodes.
        weighted (bool): whether the graph is weighted. If unweighted,
            only two columns are expected in the edgelist file, and the
            edge weights are implicitly set to 1 for all interactions. If
            weighted, a third column encoding the weight of the
            interaction in numeric value is expected.
        directed (bool): whether the graph is directed, if undirected, the
            edge connecting from destination node to source node is
            created with same edge weight from source node to destination
            node.
        delimiter (str): delimiter of the edge list file, default is tab.

    """
    with open(path, encoding="utf-8") as f:
        for edge_line in f:
            # A blank (or whitespace only) line, e.g., at the end of the
            # file, carries no edge and would otherwise fail to parse.
            if not edge_line.strip():
                continue
            edge = self._read_edge_line(edge_line, weighted, delimiter)
            self.add_edge(*edge, directed)
""" with open(path, "w", encoding="utf-8") as f: for h, t, w in self.edges_iter: h_id, t_id = self.nodes[h], self.nodes[t] terms = (h_id, t_id) if unweighted else (h_id, t_id, str(w)) f.write(f"{delimiter.join(terms)}\n") def to_csr(self) -> CSR: """Construct compressed sparse row matrix.""" indptr: Uint32Array = np.zeros(len(self.nodes) + 1, dtype=np.uint32) for i, row_data in enumerate(self._data): indptr[i + 1] = indptr[i] + len(row_data) # last element of indptr indicates the total number of nonzero entries indices = np.zeros(indptr[-1], dtype=np.uint32) data = np.zeros(indptr[-1], dtype=np.float32) for i, nbrs in enumerate(self._data): if len(nbrs) == 0: continue new_indices, new_data = zip(*[(j, nbrs[j]) for j in sorted(nbrs)]) chunk = slice(indptr[i], indptr[i + 1]) indices[chunk] = np.array(new_indices, dtype=np.uint32) data[chunk] = np.array(new_data, dtype=np.float32) return indptr, indices, data def to_dense(self) -> AdjMat: """Construct dense adjacency matrix. Note: This method does not return a DenseGraph object, but instead returns a dense adjacency matrix as NDArray, where the index is the same as that of ``nodes``. Return: NDArray: Full adjacency matrix as 2d numpy array. """ n_nodes = len(self.nodes) mat = np.zeros((n_nodes, n_nodes)) for src_node, src_nbrs in enumerate(self._data): for dst_node in src_nbrs: mat[src_node, dst_node] = src_nbrs[dst_node] return mat @classmethod def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs): """Construct graph using adjacency matrix and node IDs. Args: adj_mat(NDArray): 2D numpy array of adjacency matrix node_ids(:obj:`list` of str): node ID list Return: An adjacency graph object representing the adjacency matrix. 
""" g = cls(**kwargs) # Setup node idmap in the order of node_ids for node_id in node_ids: g.add_node(node_id) # Fill in edge data for idx1, idx2 in zip(*np.where(adj_mat != 0)): g._add_edge_from_idx(idx1, idx2, adj_mat[idx1, idx2]) return g class SparseGraph(BaseGraph): """Sparse Graph object that stores graph as adjacency list. Examples: Read ``.edg`` file and create ``SparseGraph`` object using ``.read_edg`` method. >>> from pecanpy.graph import SparseGraph >>> >>> # initialize SparseGraph object >>> g = SparseGraph() >>> >>> # read graph from edgelist >>> g.read_edg(path_to_edg_file, weighted=True, directed=False) >>> >>> # save the csr graph as npz file to be used later >>> g.save(npz_outpath) """ def __init__(self): super().__init__() self.data: Optional[Float32Array] = None self.indptr: Optional[Uint32Array] = None self.indices: Optional[Uint32Array] = None @property def num_edges(self) -> int: """Return the number of edges in the graph.""" if self.indptr is not None: return self.indptr[-1] else: raise ValueError("Empty graph.") def read_edg( self, path: str, weighted: bool, directed: bool, delimiter: str = "\t", ): """Create CSR sparse graph from edge list. First create ``AdjlstGraph`` by reading the edge list file, and then convert to ``SparseGraph`` via ``to_csr``. Args: path (str): path to edgelist file. weighted (bool): whether the graph is weighted. directed (bool): whether the graph is directed. delimiter (str): delimiter used between node IDs. """ g = AdjlstGraph() g.read(path, weighted, directed, delimiter) self.set_node_ids(g.nodes) self.indptr, self.indices, self.data = g.to_csr() def read_npz(self, path: str, weighted: bool, implicit_ids: bool = False): """Directly read a CSR sparse graph. Note: To generate a CSR file compatible with PecanPy, first load the graph as a sparse graph using the SparseGraph (with ``csr=True``). Then save the sparse graph to a csr file using the ``save`` method from ``SparseGraph``. 
The saved ``.npz`` file can then be loaded directly by ``SparseGraph`` later. Args: path (str): path to the csr file, which is an npz file with four arrays with keys 'IDs', 'data', 'indptr', 'indices', which correspond to the node IDs, the edge weights, the offset array for each node, and the indices of the edges. weighted (bool): whether the graph is weighted, if unweighted, all edge weights will be converted to 1. directed (bool): not used, for compatibility with ``SparseGraph``. implicit_ids (bool): Implicitly set the node IDs to the canonical node ordering from the CSR graph. If unset and the `IDs` field is not found in the input CSR graph, a warning message will be displayed on screen. The missing `IDs` field can happen, for example, when the user uses the CSR graph prepared by `scipy.sparse.csr`. """ raw = np.load(path) self.indptr = raw["indptr"].astype(np.uint32) self.indices = raw["indices"].astype(np.uint32) self.data = raw["data"].astype(np.float32) if self.data is None: raise ValueError("Adjacency matrix data not found.") elif not weighted: self.data[:] = 1.0 # overwrite edge weights with constant self.set_node_ids( raw.get("IDs"), implicit_ids=implicit_ids, num_nodes=int(self.indptr.size - 1), ) def save(self, path: str): """Save CSR as ``.csr.npz`` file.""" np.savez( path, IDs=self.nodes, data=self.data, indptr=self.indptr, indices=self.indices, ) @classmethod def from_adjlst_graph(cls, adjlst_graph, **kwargs): """Construct csr graph from adjacency list graph. Args: adjlst_graph (:obj:`pecanpy.graph.AdjlstGraph`): Adjacency list graph to be converted. """ g = cls(**kwargs) g.set_node_ids(adjlst_graph.nodes) g.indptr, g.indices, g.data = adjlst_graph.to_csr() return g @classmethod def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs): """Construct csr graph using adjacency matrix and node IDs. Note: Only consider positive valued edges. 
def read_npz(self, path: str, weighted: bool, implicit_ids: bool = False):
    """Read ``.npz`` file and create dense graph.

    Args:
        path (str): path to ``.npz`` file.
        weighted (bool): whether the graph is weighted, if unweighted,
            all non-zero weights will be converted to 1.
        implicit_ids (bool): Implicitly set the node IDs to the canonical
            ordering from the dense adjacency matrix object. If unset and
            the `IDs` field is not found in the object, a warning message
            will be displayed on screen. This warning message can be
            suppressed if `implicit_ids` is set to True as a confirmation
            of the behavior.

    Raises:
        KeyError: If 'data' is missing from the npz file.

    """
    # Use the archive as a context manager so the file handle is closed
    # once all the arrays have been read into memory.
    with np.load(path) as raw:
        self.data = raw["data"]
        # ``files`` lists the stored keys; it is checked explicitly rather
        # than relying on the archive object providing a ``get`` method.
        node_ids = raw["IDs"] if "IDs" in raw.files else None

    if not weighted:  # overwrite edge weights with constant
        self.data = self.nonzero * 1.0  # type: ignore

    self.set_node_ids(
        node_ids,
        implicit_ids=implicit_ids,
        num_nodes=self.data.shape[0],
    )
Args: adj_mat(NDArray): 2D numpy array of adjacency matrix node_ids(:obj:`list` of str): node ID list """ g = cls(**kwargs) g.data = adj_mat g.set_node_ids(node_ids) return g ================================================ FILE: src/pecanpy/pecanpy.py ================================================ """Different strategies for generating node2vec walks.""" import numpy as np from gensim.models import Word2Vec from numba import njit from numba import prange from numba_progress import ProgressBar from .graph import BaseGraph from .rw import DenseRWGraph from .rw import SparseRWGraph from .typing import Embeddings from .typing import Float32Array from .typing import HasNbrs from .typing import List from .typing import MoveForward from .typing import Optional from .typing import Uint32Array from .typing import Uint64Array from .wrappers import Timer try: from numba.np.ufunc.parallel import get_thread_id except ImportError: # numba<0.56 from numba.np.ufunc.parallel import _get_thread_id as get_thread_id class Base(BaseGraph): """Base node2vec object. This base object provides the skeleton for the node2vec walk algorithm, which consists of the ``simulate_walks`` method that generate node2vec random walks. In contrast to the original Python implementation of node2vec, it is parallelized where each process generates walks independently. Args: p (float): return parameter, value less than 1 encourages returning back to previous vertex, and discourage for value grater than 1 (default: 1). q (float): in-out parameter, value less than 1 encourages walks to go "outward", and value greater than 1 encourage walking within a localized neighborhood (default: 1) workers (int): number of threads to be spawned for running node2vec including walk generation and word2vec embedding (default: 1) verbose (bool): show progress bar for walk generation. extend (bool): use node2vec+ extension if set to :obj:`True` (default: :obj:`False`). 
gamma (float): Multiplication factor for the std term of edge weights added to the average edge weights as the noisy edge threshold, only used by node2vec+ (default: 0) random_state (int, optional): Random seed for generating random walks. Note that to fully ensure reproducibility, use single thread (i.e., workers=1), and potentially need to set the Python environment variable ``PYTHONHASHSEED`` to match the random_state (default: :obj:`None`). Note: The ``preprocess_transition_probs`` is required for implementations that precomputes and stores 2nd order transition probabilities. Examples: Generate node2vec embeddings >>> from pecanpy import pecanpy as node2vec >>> >>> # initialize node2vec object, similarly for SparseOTF and DenseOTF >>> g = node2vec.PreComp(p=0.5, q=1, workers=4, verbose=True) >>> # alternatively, can specify ``extend=True`` for using node2vec+ >>> >>> # load graph from edgelist file >>> g.read_edg(path_to_edg_file, weighted=True, directed=False) >>> # precompute and save 2nd order transition probs (for PreComp only) >>> g.preprocess_transition_probs() >>> >>> # generate random walks, which could then be used to train w2v >>> walks = g.simulate_walks(num_walks=10, walk_length=80) >>> >>> # alternatively, generate the embeddings directly using ``embed`` >>> emd = g.embed() """ def __init__( self, p: float = 1, q: float = 1, workers: int = 1, verbose: bool = False, extend: bool = False, gamma: float = 0, random_state: Optional[int] = None, ): super().__init__() self.p = p self.q = q self.workers = workers # TODO: not doing anything, need to fix. self.verbose = verbose self.extend = extend self.gamma = gamma self.random_state = random_state self._preprocessed: bool = False def _map_walk(self, walk_idx_ary: Uint32Array) -> List[str]: """Map walk from node index to node ID. Note: The last element in the ``walk_idx_ary`` encodes the effective walk length. Only walk indices up to the effective walk length are translated (mapped to node IDs). 
""" end_idx = walk_idx_ary[-1] walk = [self.nodes[i] for i in walk_idx_ary[:end_idx]] return walk def simulate_walks( self, num_walks: int, walk_length: int, ) -> List[List[str]]: """Generate walks starting from each nodes ``num_walks`` time. Note: This is the master process that spawns worker processes, where the worker function ``node2vec_walks`` genearte a single random walk starting from a vertex of the graph. Args: num_walks (int): number of walks starting from each node. walks_length (int): length of walk. """ self._preprocess_transition_probs() nodes = np.array(range(self.num_nodes), dtype=np.uint32) start_node_idx_ary = np.concatenate([nodes] * num_walks) tot_num_jobs = start_node_idx_ary.size random_state = self.random_state np.random.seed(random_state) np.random.shuffle(start_node_idx_ary) # for balanced work load move_forward = self.get_move_forward() has_nbrs = self.get_has_nbrs() verbose = self.verbose # Acquire numba progress proxy for displaying the progress bar with ProgressBar(total=tot_num_jobs, disable=not verbose) as progress: walk_idx_mat = self._random_walks( tot_num_jobs, walk_length, random_state, start_node_idx_ary, has_nbrs, move_forward, progress, ) # Map node index back to node ID walks = [self._map_walk(walk_idx_ary) for walk_idx_ary in walk_idx_mat] return walks @staticmethod @njit(parallel=True, nogil=True) def _random_walks( tot_num_jobs: int, walk_length: int, random_state: Optional[int], start_node_idx_ary: Uint32Array, has_nbrs: HasNbrs, move_forward: MoveForward, progress_proxy: ProgressBar, ) -> Uint32Array: """Simulate a random walk starting from start node.""" # Seed the random number generator if random_state is not None: np.random.seed(random_state + get_thread_id()) # use the last entry of each walk index array to keep track of the # effective walk length walk_idx_mat: Uint32Array = np.zeros( (tot_num_jobs, walk_length + 2), dtype=np.uint32, ) walk_idx_mat[:, 0] = start_node_idx_ary # initialize seeds walk_idx_mat[:, -1] = 
walk_length + 1 # set to full walk length by default for i in prange(tot_num_jobs): # initialize first step as normal random walk start_node_idx = walk_idx_mat[i, 0] if has_nbrs(start_node_idx): walk_idx_mat[i, 1] = move_forward(start_node_idx) else: walk_idx_mat[i, -1] = 1 continue # start bias random walk for j in range(2, walk_length + 1): cur_idx = walk_idx_mat[i, j - 1] if has_nbrs(cur_idx): prev_idx = walk_idx_mat[i, j - 2] walk_idx_mat[i, j] = move_forward(cur_idx, prev_idx) else: walk_idx_mat[i, -1] = j break progress_proxy.update(1) return walk_idx_mat def setup_get_normalized_probs(self): """Transition probability computation setup. This function performs necessary preprocessing of computing the average edge weights array, which is used later by the transition probability computation function ``get_extended_normalized_probs``, if node2vec+ is used. Otherwise, returns the normal transition function ``get_noramlized_probs`` with a trivial placeholder for average edge weights array ``noise_thresholds``. """ if self.extend: # use n2v+ get_normalized_probs = self.get_extended_normalized_probs noise_thresholds = self.get_noise_thresholds() else: # use normal n2v get_normalized_probs = self.get_normalized_probs noise_thresholds = None return get_normalized_probs, noise_thresholds def preprocess_transition_probs(self): """Null default preprocess method.""" pass def _preprocess_transition_probs(self): if not self._preprocessed: self.preprocess_transition_probs() self._preprocessed = True def embed( self, dim: int = 128, num_walks: int = 10, walk_length: int = 80, window_size: int = 10, epochs: int = 1, verbose: bool = False, ) -> Embeddings: """Generate embeddings. This is a shortcut function that combines ``simulate_walks`` with ``Word2Vec`` to generate the node2vec embedding. Note: The resulting embeddings are aligned with the graph, i.e., the index of embeddings is the same as that for the graph. 
class FirstOrderUnweighted(Base, SparseRWGraph):
    """First order random walks that pick a neighbor uniformly at random."""

    def __init__(self, *args, **kwargs):
        Base.__init__(self, *args, **kwargs)

    def get_move_forward(self):
        """Build the compiled ``move_forward`` function for this graph."""
        nbr_indices = self.indices
        row_offsets = self.indptr

        @njit(nogil=True)
        def move_forward(cur_idx, prev_idx=None):
            # Neighbors of cur_idx occupy [lo, hi) in the index array;
            # draw one position in that range (hi is exclusive).
            lo = row_offsets[cur_idx]
            hi = row_offsets[cur_idx + 1]
            pick = np.random.randint(lo, hi)
            return nbr_indices[pick]

        return move_forward
def preprocess_transition_probs(self):
    """Precompute and store first order transition probabilities.

    For each node, compute the normalized first order transition
    probabilities to its neighbors and set up the corresponding alias
    draw table. The tables of all nodes are stored in ``alias_j`` and
    ``alias_q``, which are aligned with ``indices`` (the table of node
    ``i`` occupies ``indptr[i]`` to ``indptr[i + 1]``).

    """
    data = self.data
    indices = self.indices
    indptr = self.indptr

    # Retrieve transition probability computation callback function
    get_normalized_probs = self.get_normalized_probs_first_order

    # Determine the dimensionality of the 1st order transition probs
    n_nodes = indptr.size - 1  # number of nodes
    n_probs = indptr[-1]  # total number of 1st order transition probs

    @njit(parallel=True, nogil=True)
    def compute_all_transition_probs():
        alias_j = np.zeros(n_probs, dtype=np.uint32)
        alias_q = np.zeros(n_probs, dtype=np.float32)

        # Each node writes to its own disjoint [start, end) slice, so the
        # iterations are independent; ``prange`` (instead of ``range``) is
        # what lets ``parallel=True`` distribute them over threads.
        for idx in prange(n_nodes):
            start, end = indptr[idx], indptr[idx + 1]
            probs = get_normalized_probs(data, indices, indptr, idx)
            alias_j[start:end], alias_q[start:end] = alias_setup(probs)

        return alias_j, alias_q

    self.alias_j, self.alias_q = compute_all_transition_probs()
""" data = self.data indices = self.indices indptr = self.indptr p = self.p q = self.q get_normalized_probs = self.get_normalized_probs alias_j = self.alias_j alias_q = self.alias_q alias_indptr = self.alias_indptr alias_dim = self.alias_dim @njit(nogil=True) def move_forward(cur_idx, prev_idx=None): """Move to next node based on transition probabilities.""" if prev_idx is None: normalized_probs = get_normalized_probs( data, indices, indptr, p, q, cur_idx, None, None, ) cdf = np.cumsum(normalized_probs) choice = np.searchsorted(cdf, np.random.random()) else: # Find index of neighbor (previous node) for reading alias start = indptr[cur_idx] end = indptr[cur_idx + 1] nbr_idx = np.searchsorted(indices[start:end], prev_idx) if indices[start + nbr_idx] != prev_idx: print("FATAL ERROR! Neighbor not found.") dim = alias_dim[cur_idx] start = alias_indptr[cur_idx] + dim * nbr_idx end = start + dim choice = alias_draw(alias_j[start:end], alias_q[start:end]) return indices[indptr[cur_idx] + choice] return move_forward def preprocess_transition_probs(self): """Precompute and store 2nd order transition probabilities. Each node contains n ** 2 number of 2nd order transition probabilities, where n is the number of neighbors of that specific node, since one can pick any one of its neighbors as the previous node and / or the next node. For each second order transition probability of a node, set up the alias draw table to be used during random walk. Note: Uses uint64 instead of uint32 for tracking alias_indptr to prevent overflowing since the 2nd order transition probs grows much faster than the first order transition probs, which is the same as the total number of edges in the graph. 
""" data = self.data indices = self.indices indptr = self.indptr p = self.p q = self.q # Retrieve transition probability computation callback function get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs() # Determine the dimensionality of the 2nd order transition probs n_nodes = self.indptr.size - 1 # number of nodes n = self.indptr[1:] - self.indptr[:-1] # number of nbrs per node n2 = np.power(n, 2) # number of 2nd order trans probs per node # Set the dimensionality of alias probability table self.alias_dim = alias_dim = n self.alias_indptr = alias_indptr = np.zeros(self.indptr.size, dtype=np.uint64) alias_indptr[1:] = np.cumsum(n2) n_probs = alias_indptr[-1] # total number of 2nd order transition probs @njit(parallel=True, nogil=True) def compute_all_transition_probs(): alias_j = np.zeros(n_probs, dtype=np.uint32) alias_q = np.zeros(n_probs, dtype=np.float32) for idx in range(n_nodes): offset = alias_indptr[idx] dim = alias_dim[idx] nbrs = indices[indptr[idx] : indptr[idx + 1]] for nbr_idx in prange(n[idx]): nbr = nbrs[nbr_idx] probs = get_normalized_probs( data, indices, indptr, p, q, idx, nbr, noise_thresholds, ) start = offset + dim * nbr_idx end = start + dim alias_j[start:end], alias_q[start:end] = alias_setup(probs) return alias_j, alias_q self.alias_j, self.alias_q = compute_all_transition_probs() class SparseOTF(Base, SparseRWGraph): """Sparse graph transition on the fly. This implementation does *NOT* precompute transition probabilities in advance but instead calculates them on-the-fly during the process of random walk. The graph type used is ``SparseRWGraph``. """ def __init__(self, *args, **kwargs): Base.__init__(self, *args, **kwargs) def get_move_forward(self): """Wrap ``move_forward``. 
class DenseOTF(Base, DenseRWGraph):
    """Dense graph transition on the fly.

    This implementation does *NOT* precompute transition probabilities in
    advance but instead calculates them on-the-fly during the process of
    random walk. The graph type used is ``DenseRWGraph``.

    """

    def __init__(self, *args, **kwargs):
        Base.__init__(self, *args, **kwargs)

    def get_move_forward(self):
        """Wrap ``move_forward``.

        This function returns a ``numba.njit`` compiled function that takes
        current vertex index (and the previous vertex index if available) and
        returns the next vertex index by sampling from a discrete random
        distribution based on the transition probabilities that are
        calculated on-the-fly.

        Note:
            The returned function is used by the ``simulate_walks`` method.

        """
        data = self.data
        nonzero = self.nonzero
        p = self.p
        q = self.q

        get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()

        @njit(nogil=True)
        def move_forward(cur_idx, prev_idx=None):
            """Move to next node."""
            normalized_probs = get_normalized_probs(
                data,
                nonzero,
                p,
                q,
                cur_idx,
                prev_idx,
                noise_thresholds,
            )
            cdf = np.cumsum(normalized_probs)
            choice = np.searchsorted(cdf, np.random.random())
            # Rounding can leave cdf[-1] slightly below 1; a draw above it
            # makes searchsorted return cdf.size, which would index one past
            # the last neighbor. Clamp to the last valid position.
            choice = min(choice, cdf.size - 1)
            nbrs = np.where(nonzero[cur_idx])[0]

            return nbrs[choice]

        return move_forward
""" k = probs.size q = np.zeros(k, dtype=np.float32) j = np.zeros(k, dtype=np.uint32) smaller = np.zeros(k, dtype=np.uint32) larger = np.zeros(k, dtype=np.uint32) smaller_ptr = 0 larger_ptr = 0 for kk in range(k): q[kk] = k * probs[kk] if q[kk] < 1.0: smaller[smaller_ptr] = kk smaller_ptr += 1 else: larger[larger_ptr] = kk larger_ptr += 1 while (smaller_ptr > 0) & (larger_ptr > 0): smaller_ptr -= 1 small = smaller[smaller_ptr] larger_ptr -= 1 large = larger[larger_ptr] j[small] = large q[large] = q[large] + q[small] - 1.0 if q[large] < 1.0: smaller[smaller_ptr] = large smaller_ptr += 1 else: larger[larger_ptr] = large larger_ptr += 1 return j, q @njit(nogil=True) def alias_draw(j, q): """Draw sample from a non-uniform discrete distribution using alias sampling.""" k = j.size kk = np.random.randint(k) if np.random.rand() < q[kk]: return kk else: return j[kk] ================================================ FILE: src/pecanpy/rw/__init__.py ================================================ """Graph objects equipped with random walk transition functions.""" from .dense_rw import DenseRWGraph from .sparse_rw import SparseRWGraph __all__ = ["DenseRWGraph", "SparseRWGraph"] ================================================ FILE: src/pecanpy/rw/dense_rw.py ================================================ """Dense Graph object equipped with random walk computation.""" import numpy as np from numba import njit from ..graph import DenseGraph class DenseRWGraph(DenseGraph): """Dense Graph object equipped with random walk computation.""" def get_noise_thresholds(self): """Compute average edge weights.""" noise_threshold_ary = np.zeros(self.num_nodes, dtype=np.float32) for i in range(self.num_nodes): weights = self.data[i, self.nonzero[i]] noise_threshold_ary[i] = weights.mean() + self.gamma * weights.std() noise_threshold_ary = np.maximum(noise_threshold_ary, 0) return noise_threshold_ary def get_has_nbrs(self): """Wrap ``has_nbrs``.""" nonzero = self.nonzero @njit(nogil=True) 
@staticmethod
@njit(nogil=True)
def get_normalized_probs(
    data,
    nonzero,
    p,
    q,
    cur_idx,
    prev_idx,
    noise_threshold_ary,
):
    """Calculate node2vec transition probabilities.

    Start from a copy of the edge weights of the current state. When a
    previous state is given, divide the weights of the neighbors of the
    current state that are not neighbors of the previous state by the in-out
    parameter ``q``, and divide the weight of the edge back to the previous
    state by the return parameter ``p``. The weights restricted to the
    neighbors of the current state are then normalized to sum to one.

    Note:
        If ``prev_idx`` present, calculate 2nd order biased transition,
        otherwise calculate 1st order transition. ``noise_threshold_ary`` is
        accepted but not read by this function.

    """
    is_nbr = nonzero[cur_idx]
    weights = data[cur_idx].copy()  # copy so the graph data stays untouched

    if prev_idx is not None:  # 2nd order biased walks
        out_mask = np.logical_and(is_nbr, ~nonzero[prev_idx])
        out_mask[prev_idx] = False  # exclude previous state from out biases
        weights[out_mask] /= q  # apply out biases
        weights[prev_idx] /= p  # apply the return bias

    nbr_weights = weights[is_nbr]
    return nbr_weights / nbr_weights.sum()
out_ind = cur_nbrs_ind & (prev_nbrs_weight < noise_threshold_ary) out_ind[prev_idx] = False # exclude previous state from out biases # print("CURRENT: ", cur_idx) # print("INOUT: ", np.where(out_ind)[0]) # print("NUM INOUT: ", out_ind.sum(), "\n") t = prev_nbrs_weight[out_ind] / noise_threshold_ary[out_ind] # optional nonlinear parameterization # b = 1; t = b * t / (1 - (b - 1) * t) # compute out biases alpha = 1 / q + (1 - 1 / q) * t # suppress noisy edges alpha[ unnormalized_probs[out_ind] < noise_threshold_ary[cur_idx] ] = np.minimum(1, 1 / q) unnormalized_probs[out_ind] *= alpha # apply out biases unnormalized_probs[prev_idx] /= p # apply the return bias unnormalized_probs = unnormalized_probs[cur_nbrs_ind] normalized_probs = unnormalized_probs / unnormalized_probs.sum() return normalized_probs ================================================ FILE: src/pecanpy/rw/sparse_rw.py ================================================ """Sparse Graph equipped with random walk computation.""" import numpy as np from numba import boolean from numba import njit from ..graph import SparseGraph class SparseRWGraph(SparseGraph): """Sparse Graph equipped with random walk computation.""" def get_has_nbrs(self): """Wrap ``has_nbrs``.""" indptr = self.indptr @njit(nogil=True) def has_nbrs(idx): return indptr[idx] != indptr[idx + 1] return has_nbrs def get_noise_thresholds(self): """Compute average edge weights.""" data = self.data indptr = self.indptr noise_threshold_ary = np.zeros(self.num_nodes, dtype=np.float32) for i in range(self.num_nodes): noise_threshold_ary[i] = ( data[indptr[i] : indptr[i + 1]].mean() + self.gamma * data[indptr[i] : indptr[i + 1]].std() ) noise_threshold_ary = np.maximum(noise_threshold_ary, 0) return noise_threshold_ary @staticmethod @njit(nogil=True) def get_normalized_probs_first_order(data, indices, indptr, cur_idx): """Calculate first order transition probabilities. 
@staticmethod
@njit(nogil=True)
def get_normalized_probs(
    data,
    indices,
    indptr,
    p,
    q,
    cur_idx,
    prev_idx,
    noise_threshold_ary,
):
    """Calculate node2vec transition probabilities.

    Start from a copy of the edge weights of the current state. When a
    previous state is given, divide the weights of the neighbors of the
    current state that are not neighbors of the previous state by the in-out
    parameter ``q``, and divide the weight of the edge back to the previous
    state by the return parameter ``p``. The biased weights are then
    normalized to sum to one.

    Note:
        If ``prev_idx`` present, calculate 2nd order biased transition,
        otherwise calculate 1st order transition. ``noise_threshold_ary`` is
        accepted but not read by this function.

    """
    cur_nbrs, weights = get_nbrs(indptr, indices, data, cur_idx)

    if prev_idx is not None:  # 2nd order biased walk
        # Position(s) of the previous state among the current neighbors
        back_pos = np.where(cur_nbrs == prev_idx)[0]
        prev_nbrs, _ = get_nbrs(indptr, indices, data, prev_idx)

        # Neighbors of current but not previous
        out_mask = isnotin(cur_nbrs, prev_nbrs)
        out_mask[back_pos] = False  # exclude prev state from out biases

        weights[out_mask] /= q  # apply out biases
        weights[back_pos] /= p  # apply the return bias

    return weights / weights.sum()
@njit(nogil=True)
def get_nbrs(indptr, indices, data, idx):
    """Return neighbor indices and weights of a specific node index.

    The neighbor indices are a slice of ``indices``, while the weights are
    returned as a copy of the matching slice of ``data``.

    """
    lo = indptr[idx]
    hi = indptr[idx + 1]
    return indices[lo:hi], data[lo:hi].copy()
@njit(nogil=True)
def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, noise_thresholds):
    """Find node2vec+ out edges.

    The node2vec+ out edges are determined by considering the edge weights
    connecting node2 (the potential next state) to the previous state. Unlike
    node2vec, which only considers neighbors of current state that are not
    neighbors of the previous state, node2vec+ also considers neighbors of the
    previous state as out edges if the edge weight is below average.

    Both pointer arrays are scanned once with two cursors, which relies on
    them being sorted ascendingly.

    Args:
        ptr_ary1 (Uint32Array): array of pointers to the neighbors of the
            current state
        ptr_ary2 (Uint32Array): array of pointers to the neighbors of the
            previous state
        wts_ary2 (Float32Array): array of edge weights of the previous state
        noise_thresholds (Float32Array): array of noisy edge threshold
            computed based on the average and the std of the edge weights of
            each node

    Return:
        Indicator of whether a neighbor of the current state is considered as
        an "out edge", with the corresponding parameters used to fine tune the
        out biases

    """
    indicator = np.ones(ptr_ary1.size, dtype=boolean)
    t = np.zeros(ptr_ary1.size, dtype=np.float32)
    size2 = ptr_ary2.size
    idx2 = 0
    for idx1 in range(ptr_ary1.size):
        ptr1 = ptr_ary1[idx1]

        # Let the cursor on ptr_ary2 catch up on ptr1. The cursor never moves
        # backwards, so the whole scan is linear even when ptr_ary2 runs out
        # before a match is found.
        while idx2 < size2 and ptr_ary2[idx2] < ptr1:
            idx2 += 1

        if idx2 == size2:  # end of ary2, the rest stay as out edges
            break

        ptr2 = ptr_ary2[idx2]
        if ptr2 == ptr1:  # found a matching value
            # If connection is not loose, identify as an in-edge
            if wts_ary2[idx2] >= noise_thresholds[ptr2]:
                indicator[idx1] = False
            else:
                t[idx1] = wts_ary2[idx2] / noise_thresholds[ptr2]
            idx2 += 1

    return indicator, t
class Timer:
    """Timer for logging runtime of function.

    Instances are used as decorators: calling the instance with a callable
    returns either a timed wrapper (``verbose=True``) or the callable itself
    (``verbose=False``).

    Args:
        name (str): description of the timed task, printed at the end of the
            report line.
        verbose (bool): whether to time the decorated callable and print the
            report.

    """

    def __init__(self, name, verbose=True):
        self.name = name
        self.verbose = verbose

    def __call__(self, func):
        """Call timer decorator.

        Args:
            func: the callable to be timed.

        Returns:
            ``func`` itself if ``verbose`` is False, otherwise a wrapper that
            forwards all arguments to ``func``, prints the elapsed time as
            ``HH:MM:SS.ss`` and returns the result of ``func``.

        """
        # Local import so that this block does not depend on the import
        # section of the module.
        import functools

        if not self.verbose:
            return func

        @functools.wraps(func)  # keep the name / docstring of ``func``
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic, so the measured duration cannot be
            # distorted by adjustments of the system clock.
            start = time.perf_counter()
            result = func(*args, **kwargs)
            duration = time.perf_counter() - start

            hrs = int(duration // 3600)
            mins = int(duration % 3600 // 60)
            secs = duration % 60
            print(f"Took {hrs:02d}:{mins:02d}:{secs:05.2f} to {self.name}")

            return result

        return wrapper
class TestCli(unittest.TestCase):
    """End-to-end checks of the command line workflow on the karate graph."""

    @classmethod
    def setUpClass(cls):
        # Convert the edge list into the npz inputs used by ``test_from_npz``.
        # ``check=True`` makes a failed conversion fail loudly here instead of
        # surfacing later as a confusing missing-file error.
        try:
            subprocess.run(COM + [CSR_FP, "--task", "tocsr"], check=True)
            subprocess.run(COM + [DENSE_FP, "--task", "todense"], check=True)
        except BaseException:
            # tearDownClass is not run when setUpClass raises, so clean up
            # the temporary directory before propagating the error.
            shutil.rmtree(TMP_DATA_DIR, ignore_errors=True)
            raise

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(TMP_DATA_DIR)

    @patch(
        "argparse._sys.argv",
        ["pecanpy", "--input", "", "--output", os.devnull],
    )
    def setUp(self):
        # Parse a minimal command line, then shrink the workload so that each
        # test runs quickly.
        self.args = cli.parse_args()
        self.args.workers = 1
        self.args.dimensions = 8
        self.args.walk_length = 10
        self.args.num_walks = 2
        self.g = self.walks = None

    def tearDown(self):
        del self.args
        del self.g
        del self.walks

    def execute(self, mode, input_file, p=1, q=1):
        """Run read -> preprocess -> walk -> embed with the given setting."""
        self.args.mode = mode
        self.args.input = input_file
        self.args.p = p
        self.args.q = q
        self.g = cli.read_graph(self.args)
        cli.preprocess(self.g)
        self.walks = cli.simulate_walks(self.args, self.g)
        cli.learn_embeddings(self.args, self.walks)

    def test_firstorderunweighted_catch(self):
        for p, q in (2, 1), (1, 0.1), (0.1, 0.1):
            with self.subTest(p=p, q=q):
                with self.assertRaises(ValueError):
                    self.execute("FirstOrderUnweighted", EDG_FP, p, q)

    def test_precompfirstorder_catch(self):
        for p, q in (2, 1), (1, 0.1), (0.1, 0.1):
            with self.subTest(p=p, q=q):
                with self.assertRaises(ValueError):
                    self.execute("PreCompFirstOrder", EDG_FP, p, q)

    @parameterized.expand(SETTINGS)
    def test_from_edg(self, name):
        self.execute(name, EDG_FP)

    @parameterized.expand(SETTINGS)
    def test_from_npz(self, name):
        self.execute(name, DENSE_FP if name == "DenseOTF" else CSR_FP)
class TestBaseGraph(unittest.TestCase):
    """Checks for node id bookkeeping and the properties of ``BaseGraph``."""

    def setUp(self):
        graph = BaseGraph()
        graph.set_node_ids(IDS)
        self.g = graph

    def test_set_node_ids(self):
        # Both the id list and the id -> index map are set by set_node_ids
        g = self.g
        self.assertEqual(g.nodes, IDS)
        self.assertEqual(g._node_idmap, IDMAP)

    def test_properties(self):
        g = self.g
        self.assertEqual(g.num_nodes, 3)

        # The edge related properties are expected to raise on the base class
        with self.assertRaises(NotImplementedError):
            self.assertEqual(g.num_edges, 4)

        with self.assertRaises(NotImplementedError):
            self.assertEqual(g.density, 2 / 3)
class TestDenseGraph(unittest.TestCase):
    """Checks that ``DenseGraph`` is built correctly from different sources."""

    def tearDown(self):
        del self.g1
        del self.g2
        # g3 is assigned by every test and read by ``validate``, so release
        # it as well (as done in ``TestSparseGraph``).
        del self.g3

    def validate(self):
        """Compare the three loaded graphs against the reference data."""
        self.assertTrue(np.all(self.g1.data == MAT))
        self.assertEqual(self.g1.nodes, IDS)
        self.assertEqual(self.g1.num_nodes, 3)
        self.assertEqual(self.g1.num_edges, 4)
        self.assertEqual(self.g1.density, 2 / 3)

        self.assertTrue(np.all(self.g2.data == MAT2))
        self.assertEqual(self.g2.nodes, IDS2)
        self.assertEqual(self.g2.num_nodes, 5)
        self.assertEqual(self.g2.num_edges, 8)
        self.assertEqual(self.g2.density, 2 / 5)

        self.assertTrue(np.all(self.g3.data == MAT3))
        self.assertEqual(self.g3.nodes, IDS3)
        self.assertEqual(self.g3.num_nodes, 4)
        self.assertEqual(self.g3.num_edges, 5)
        self.assertEqual(self.g3.density, 5 / 12)

    def test_from_mat(self):
        self.g1 = DenseGraph.from_mat(MAT, IDS)
        self.g2 = DenseGraph.from_mat(MAT2, IDS2)
        self.g3 = DenseGraph.from_mat(MAT3, IDS3)
        self.validate()

    def test_from_adjlst_graph(self):
        self.g1 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT, IDS))
        self.g2 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT2, IDS2))
        self.g3 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT3, IDS3))
        self.validate()
@pytest.fixture(scope="module")
def karate_graph_converted(pytestconfig, tmpdir_factory):
    """Write CSR and dense npz copies of the karate graph for this module.

    The fixture publishes its results as attributes on the ``pytest``
    module: ``KARATE_ORIG_PATH``, ``KARATE_CSR_PATH``, ``KARATE_DENSE_PATH``,
    ``KARATE_NODE_IDS`` (unique IDs found in the edge list file) and
    ``KARATE_IMPLICIT_IDS`` (the strings "0", "1", ... with one entry per
    node ID). It yields nothing and performs no teardown of its own.
    """
    workdir = tmpdir_factory.mktemp("test_graph")
    pytest.KARATE_ORIG_PATH = osp.join(pytestconfig.rootpath, "demo/karate.edg")
    pytest.KARATE_CSR_PATH = osp.join(workdir, "karate.csr.npz")
    pytest.KARATE_DENSE_PATH = osp.join(workdir, "karate.dense.npz")

    # Collect the distinct node IDs appearing anywhere in the edge list.
    edge_rows = np.loadtxt(pytest.KARATE_ORIG_PATH, dtype=str).tolist()
    node_ids = list(set(chain.from_iterable(edge_rows)))
    pytest.KARATE_NODE_IDS = node_ids
    pytest.KARATE_IMPLICIT_IDS = [str(i) for i in range(len(node_ids))]

    # Read the edge list once, then save it in both npz formats.
    adjlst = AdjlstGraph()
    adjlst.read(pytest.KARATE_ORIG_PATH, weighted=False, directed=False)
    SparseGraph.from_adjlst_graph(adjlst).save(pytest.KARATE_CSR_PATH)
    DenseGraph.from_adjlst_graph(adjlst).save(pytest.KARATE_DENSE_PATH)
    del adjlst

    yield
class TestPecanPy(unittest.TestCase):
    """Smoke tests: every mode in SETTINGS can load a graph and embed it.

    The tests make no assertions on the embeddings; they pass as long as
    loading and ``embed()`` complete without raising.
    """

    @classmethod
    def setUpClass(cls):
        """Load the karate graph once and keep its matrix and node IDs."""
        g = graph.DenseGraph()
        g.read_edg(EDG_FP, weighted=False, directed=False)
        # Stored on the class so every generated test case can reuse them.
        cls.mat = g.data
        cls.ids = g.nodes

    @parameterized.expand(SETTINGS)
    def test_from_mat(self, name, mode):
        # Construct from the in-memory matrix and node IDs.
        with self.subTest(name):
            g = mode.from_mat(self.mat, self.ids, p=1, q=1)
            g.embed()

    @parameterized.expand(SETTINGS)
    def test_from_edg(self, name, mode):
        # Construct empty, then read the edge list file.
        with self.subTest(name):
            g = mode(p=1, q=1)
            g.read_edg(EDG_FP, weighted=False, directed=False)
            g.embed()
class TestWalk(unittest.TestCase):
    """Compare seeded walks from each mode against the recorded WALKS."""

    # NOTE(review): the "PreCompFirstOrder" case is bound to pecanpy.PreComp,
    # so it repeats the "PreComp" case and pecanpy.PreCompFirstOrder is never
    # exercised here. WALKS holds identical expectations for both keys, so
    # pointing this case at PreCompFirstOrder may require regenerating
    # WALKS["PreCompFirstOrder"] -- confirm before changing.
    @parameterized.expand(
        [
            ("FirstOrderUnweighted", pecanpy.FirstOrderUnweighted),
            ("PreCompFirstOrder", pecanpy.PreComp),
            ("PreComp", pecanpy.PreComp),
            ("SparseOTF", pecanpy.SparseOTF),
            ("DenseOTF", pecanpy.DenseOTF),
        ],
    )
    def test_first_order_unweighted(self, name, mode):
        # Despite the method name, this runs once per (name, mode) pair above.
        model = mode.from_mat(MAT, IDS, p=1, q=1, random_state=0)
        simulated = model.simulate_walks(2, 3)
        expected = WALKS[name]
        self.assertEqual(simulated, expected)
        print(simulated)