Repository: krishnanlab/PecanPy
Branch: master
Commit: 743196280f33
Files: 36
Total size: 116.0 KB
Directory structure:
gitextract_5ev1jrt4/
├── .bumpversion.cfg
├── .github/
│ ├── dependabot.yml
│ └── workflows/
│ ├── release.yml
│ └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── LICENSE
├── README.md
├── demo/
│ ├── karate.edg
│ ├── reproducibility.sh
│ └── run_pecanpy
├── docs/
│ ├── Makefile
│ ├── requirements.txt
│ └── source/
│ ├── conf.py
│ ├── index.rst
│ └── pecanpy.rst
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
├── src/
│ └── pecanpy/
│ ├── __init__.py
│ ├── cli.py
│ ├── experimental.py
│ ├── graph.py
│ ├── pecanpy.py
│ ├── rw/
│ │ ├── __init__.py
│ │ ├── dense_rw.py
│ │ └── sparse_rw.py
│ ├── typing.py
│ └── wrappers.py
├── test/
│ ├── test_cli.py
│ ├── test_graph.py
│ ├── test_pecanpy.py
│ └── test_walk.py
└── tox.ini
================================================
FILE CONTENTS
================================================
================================================
FILE: .bumpversion.cfg
================================================
[bumpversion]
current_version = 2.0.10-dev
tag = False
commit = True
message = bump version: {current_version} -> {new_version}
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)([-](?P<release>(dev|stable)+)(?P<build>\d*))?
serialize =
{major}.{minor}.{patch}-{release}{build}
{major}.{minor}.{patch}-{release}
{major}.{minor}.{patch}
[bumpversion:part:release]
optional_value = stable
values =
dev
stable
[bumpversion:file:setup.cfg]
search = version = {current_version}
replace = version = {new_version}
# Keep the attribute name identical in search and replace; otherwise the first
# bump renames the attribute and every later bump fails to find its target.
[bumpversion:file:src/pecanpy/__init__.py]
search = __version__ = "{current_version}"
replace = __version__ = "{new_version}"
[bumpversion:file:docs/source/conf.py]
search = release = "{current_version}"
replace = release = "{new_version}"
================================================
FILE: .github/dependabot.yml
================================================
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:
interval: "daily"
ignore:
- dependency-name: "numpy"
versions: ["1.22.x"] # Numba 0.55.1 do not support numpy 1.22.x yet https://github.com/numba/numba/issues/7754
================================================
FILE: .github/workflows/release.yml
================================================
name: Release Package

on:
  release:
    types: [created]

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Pin an explicit interpreter so the build does not depend on whichever
      # Python happens to be preinstalled on the runner image.
      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine
      - name: Build and publish
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          # Invoking setup.py directly is deprecated by setuptools; the PyPA
          # "build" frontend creates the sdist and wheel in dist/ instead.
          python -m build
          twine upload dist/*
================================================
FILE: .github/workflows/tests.yml
================================================
name: Tests
on:
- push
- pull_request
jobs:
test:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
python-version: ['3.8', '3.9', '3.10', '3.11']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install tox tox-gh-actions
- name: Test with tox
run: tox
================================================
FILE: .gitignore
================================================
# vim buffer
*.swp
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# IDEA
.idea/
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: trailing-whitespace
exclude: .bumpversion.cfg
- id: end-of-file-fixer
- repo: https://github.com/asottile/reorder-python-imports
rev: v3.12.0
hooks:
- id: reorder-python-imports
args: ["--py38-plus"]
- repo: https://github.com/asottile/add-trailing-comma
rev: v3.1.0
hooks:
- id: add-trailing-comma
- repo: https://github.com/asottile/pyupgrade
rev: v3.15.0
hooks:
- id: pyupgrade
- repo: https://github.com/psf/black
rev: 23.12.1
hooks:
- id: black
args: [--safe]
================================================
FILE: .readthedocs.yml
================================================
# Read the Docs configuration file
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
sphinx:
configuration: docs/source/conf.py
python:
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt
================================================
FILE: LICENSE
================================================
BSD 3-Clause License
Copyright (c) 2020-2021, Krishnan Laboratory, Michigan State University.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: README.md
================================================
[](https://doi.org/10.5281/zenodo.6386437)
[](https://pecanpy.readthedocs.io/en/latest/?badge=latest)
[](https://github.com/psf/black)
[](https://github.com/krishnanlab/PecanPy/actions/workflows/tests.yml)
# PecanPy: A parallelized, efficient, and accelerated _node2vec(+)_ in Python
Learning low-dimensional representations (embeddings) of nodes in large graphs is key to applying machine learning on massive biological networks. _Node2vec_ is the most widely used method for node embedding. PecanPy is a fast, parallelized, memory efficient, and cache optimized Python implementation of [_node2vec_](https://github.com/aditya-grover/node2vec). It uses cache-optimized compact graph data structures and precomputing/parallelization to result in fast, high-quality node embeddings for biological networks of all sizes and densities. Detailed source code documentation can be found [here](https://pecanpy.readthedocs.io/).
The details of implementation and the optimizations, along with benchmarks, are described in the application note [_PecanPy: a fast, efficient and parallelized Python implementation of node2vec_](https://doi.org/10.1093/bioinformatics/btab202), which is published in _Bioinformatics_. The benchmarking results presented in the preprint can be reproduced using the test scripts provided in the companion [benchmarks repo](https://github.com/krishnanlab/PecanPy_benchmarks).
**v2 update**: PecanPy is now equipped with _node2vec+_, which is a natural extension of _node2vec_ and handles weighted graph more effectively. For more information, see [*Accurately Modeling Biased Random Walks on Weighted Graphs Using Node2vec+*](https://arxiv.org/abs/2109.08031). The datasets and test scripts for reproducing the presented results are available in the [node2vec+ benchmarks repo](https://github.com/krishnanlab/node2vecplus_benchmarks).
## Installation
Install from the latest release with:
```bash
$ pip install pecanpy
```
Install the latest (unreleased) version in development mode with:
```bash
$ git clone https://github.com/krishnanlab/pecanpy.git
$ cd pecanpy
$ pip install -e .
```
where `-e` means "editable" mode so you don't have to reinstall every time you make changes.
PecanPy installs a command line utility `pecanpy` that can be used directly.
## Usage
PecanPy operates in three different modes – `PreComp`, `SparseOTF`, and `DenseOTF` – that are optimized for networks of different sizes and densities; `PreComp` for networks that are small (≤10k nodes; any density), `SparseOTF` for networks that are large and sparse (>10k nodes; ≤10% of edges), and `DenseOTF` for networks that are large and dense (>10k nodes; >10% of edges). These modes appropriately take advantage of compact/dense graph data structures, precomputing transition probabilities, and computing 2nd-order transition probabilities during walk generation to achieve significant improvements in performance.
### Example
To run *node2vec* on Zachary's karate club network using `SparseOTF` mode, execute the following command from the project home directory:
```bash
pecanpy --input demo/karate.edg --output demo/karate.emb --mode SparseOTF
```
### Node2vec+
To enable _node2vec+_, specify the `--extend` option.
```bash
pecanpy --input demo/karate.edg --output demo/karate_n2vplus.emb --mode SparseOTF --extend
```
**Note**: _node2vec+_ is only beneficial for embedding _weighted_ graphs. For unweighted graphs, _node2vec+_ is equivalent to _node2vec_. The above example only serves as a demonstration of enabling _node2vec+_.
### Demo
Execute the following command for full demonstration:
```bash
sh demo/run_pecanpy
```
### Mode
As mentioned above, PecanPy contains three main modes for generating node2vec random walks,
each of which is better optimized for different network sizes/densities:
| Mode | Network size/density | Optimization |
|:-----|:---------------------|:-------------|
| `PreComp` | <10k nodes, <0.1% edges | Precompute second order transition probabilities, using CSR graph |
| `SparseOTF` (default) | (≥10k nodes, ≥0.1% and <20% of edges) or (<10k nodes, ≥0.1% edges) | Transition probabilities computed on-the-fly, using CSR graph |
| `DenseOTF` | >20% of edges | Transition probabilities computed on-the-fly, using dense matrix |
#### Compatibility and recommendations
| Mode | Weighted | ``p,q!=1`` | Node2vec+ | Speed | Use this if |
|:-----|----------------|---------------|-----------|:------------|:--------|
|``PreComp``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash::dash:|The graph is small and sparse|
|``SparseOTF``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash:|The graph is sparse but not necessarily small|
|``DenseOTF``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash:|The graph is extremely dense|
|``PreCompFirstOrder``|:white_check_mark:|:x:|:x:|:dash::dash:|Run with ``p = q = 1`` on weighted graph|
|``FirstOrderUnweighted``|:x:|:x:|:x:|:dash::dash::dash:|Run with ``p = q = 1`` on unweighted graph|
### Options
Check out the full list of options available using:
```bash
pecanpy --help
```
### Input
The supported input is a network file as an edgelist `.edg` file (node id could be int or string):
```
node1_id node2_id <weight_float, optional>
```
Another supported input format (only for `DenseOTF`) is the numpy array `.npz` file. Run the following command to prepare a `.npz` file from a `.edg` file.
```bash
pecanpy --input $input_edgelist --output $output_npz --task todense
```
The default delimiter for `.edg` files is the tab character (`\t`); you may change this by passing in the `--delimiter` option.
### Output
The output file has *n+1* lines for graph with *n* vertices, with a header line of the following format:
```
num_of_nodes dim_of_representation
```
The following next *n* lines are the representations of dimension *d* following the corresponding node ID:
```
node_id dim_1 dim_2 ... dim_d
```
### Development Note
Run `black src/pecanpy/` to automatically follow black code formatting.
Run `tox -e flake8` and resolve suggestions before committing to ensure consistent code style.
## Additional Information
### Documentation
Detailed documentation for PecanPy is available [here](https://pecanpy.readthedocs.io/).
### Support
For support, please consider opening a GitHub issue and we will do our best to reply in a timely manner.
Alternatively, if you would like to keep the conversation private, feel free to contact [Remy Liu](https://twitter.com/RemyLau3) at liurenmi@msu.edu.
### License
This repository and all its contents are released under the [BSD 3-Clause License](https://opensource.org/licenses/BSD-3-Clause); see [LICENSE](https://github.com/krishnanlab/pecanpy/blob/master/LICENSE).
### Citation
If you use PecanPy, please cite:
Liu R, Krishnan A (2021) **PecanPy: a fast, efficient, and parallelized Python implementation of _node2vec_.** _Bioinformatics_ https://doi.org/10.1093/bioinformatics/btab202
If you find _node2vec+_ useful, please cite:
Liu R, Hirn M, Krishnan A (2023) **Accurately modeling biased random walks on weighted graphs using _node2vec+_.** _Bioinformatics_ https://doi.org/10.1093/bioinformatics/btad047
### Authors
Renming Liu, Arjun Krishnan*
>\*General correspondence should be addressed to AK at arjun.krishnan@cuanschutz.edu.
### Funding
This work was primarily supported by US National Institutes of Health (NIH) grants R35 GM128765 to AK and in part by MSU start-up funds to AK.
### Acknowledgements
We thank [Christopher A. Mancuso](https://github.com/ChristopherMancuso), [Anna Yannakopoulos](http://yannakopoulos.com/), and the rest of the [Krishnan Lab](https://www.thekrishnanlab.org/team) for valuable discussions and feedback on the software and manuscript. Thanks to [Charles T. Hoyt](https://github.com/cthoyt) for making the software `pip` installable and for an extensive code review.
### References
**Original _node2vec_**
* Grover, A. and Leskovec, J. (2016) node2vec: Scalable Feature Learning for Networks. ArXiv160700653 Cs Stat.
Original _node2vec_ software and networks
* https://snap.stanford.edu/node2vec/ contains the original software and the networks (PPI, BlogCatalog, and Wikipedia) used in the original study (Grover and Leskovec, 2016).
**Other networks**
* Stark, C. et al. (2006) BioGRID: a general repository for interaction datasets. Nucleic Acids Res., 34, D535–D539.
* BioGRID human protein-protein interactions.
* Szklarczyk, D. et al. (2015) STRING v10: protein–protein interaction networks, integrated over the tree of life. Nucleic Acids Res., 43, D447–D452.
* STRING predicted human gene interactions.
* Greene, C.S. et al. (2015) Understanding multicellular function and disease with human tissue-specific networks. Nat. Genet., 47, 569–576.
* GIANT-TN is a generic genome-scale human gene network. GIANT-TN-c01 is a sub-network of GIANT-TN where edges with edge weight below 0.01 are discarded.
BioGRID (Stark et al., 2006), STRING (Szklarczyk et al., 2015), and GIANT-TN (Greene et al., 2015) are available from https://doi.org/10.5281/zenodo.3352323.
* Law, J.N. et al. (2019) Accurate and Efficient Gene Function Prediction using a Multi-Bacterial Network. bioRxiv, 646687.
* SSN200 is a cross-species network of proteins from 200 species with the edges representing protein sequence similarities. Downloaded from https://bioinformatics.cs.vt.edu/~jeffl/supplements/2019-fastsinksource/.
================================================
FILE: demo/karate.edg
================================================
1 32
1 22
1 20
1 18
1 14
1 13
1 12
1 11
1 9
1 8
1 7
1 6
1 5
1 4
1 3
1 2
2 31
2 22
2 20
2 18
2 14
2 8
2 4
2 3
3 14
3 9
3 10
3 33
3 29
3 28
3 8
3 4
4 14
4 13
4 8
5 11
5 7
6 17
6 11
6 7
7 17
9 34
9 33
9 31
10 34
14 34
15 34
15 33
16 34
16 33
19 34
19 33
20 34
21 34
21 33
23 34
23 33
24 30
24 34
24 33
24 28
24 26
25 32
25 28
25 26
26 32
27 34
27 30
28 34
29 34
29 32
30 34
30 33
31 34
31 33
32 34
32 33
33 34
================================================
FILE: demo/reproducibility.sh
================================================
#!/bin/bash --login
# reproducibility.sh
# Test the reproducibility of PecanPy between runs.
source ~/.bashrc
rs=100
export PYTHONHASHSEED=$rs
conda activate pecanpy-dev
pecanpy --input karate.edg --output karate1.emd --mode FirstOrderUnweighted --workers 1 --random_state $rs
pecanpy --input karate.edg --output karate2.emd --mode FirstOrderUnweighted --workers 1 --random_state $rs
cmp karate1.emd karate2.emd
rm -f karate1.emd karate2.emd
================================================
FILE: demo/run_pecanpy
================================================
#!/bin/bash
# Demonstrate the main PecanPy modes on the karate club network.

# Always run from the project root, regardless of the caller's working
# directory. Quoting keeps paths with spaces intact, and a failed cd aborts
# instead of running the commands below from the wrong directory.
cd "$(dirname "$(realpath "$0")")" || exit 1
cd ../ || exit 1

# Echo each command before it is executed.
set -v

# run with PreComp mode (must be requested explicitly; the CLI default is SparseOTF)
pecanpy --input demo/karate.edg --output demo/karate.emb --verbose --mode PreComp

# run with SparseOTF mode
pecanpy --input demo/karate.edg --output demo/karate.emb --verbose --mode SparseOTF

# run with DenseOTF mode
pecanpy --input demo/karate.edg --output demo/karate.emb --verbose --mode DenseOTF

# convert and save edgelist as dense matrix
pecanpy --input demo/karate.edg --output demo/karate.npz --task todense

# run with DenseOTF mode using dense array as input
pecanpy --input demo/karate.npz --output demo/karate.emb --verbose --mode DenseOTF

# input parameters
pecanpy --help
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/requirements.txt
================================================
sphinx
sphinx_rtd_theme
================================================
FILE: docs/source/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath("../../src"))
# -- Project information -----------------------------------------------------
project = "PecanPy"
copyright = "2020, Renming Liu and Arjun Krishnan"
author = "Renming Liu and Arjun Krishnan"
# The full version, including alpha/beta/rc tags
release = "2.0.10-dev"
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.intersphinx",
"sphinx.ext.todo",
"sphinx.ext.coverage",
"sphinx.ext.viewcode",
"sphinx.ext.napoleon",
]
# Napoleon settings
napoleon_google_docstring = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
"networkx": ("https://networkx.github.io/documentation/latest/", None),
}
# autodoc: list documented members in the order they appear in the source file.
autodoc_member_order = "bysource"
# autodoc: for classes, include both the class docstring and the ``__init__``
# docstring.
autoclass_content = "both"
================================================
FILE: docs/source/index.rst
================================================
Welcome to PecanPy's documentation
==================================
.. toctree::
:maxdepth: 2
pecanpy
================================================
FILE: docs/source/pecanpy.rst
================================================
PecanPy package
===============
Command line interface
----------------------
.. automodule:: pecanpy.cli
:members:
:undoc-members:
:show-inheritance:
Graph Data Structures
---------------------
.. automodule:: pecanpy.graph
:members:
:undoc-members:
:show-inheritance:
Node2vec implementations
------------------------
.. automodule:: pecanpy.pecanpy
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools>=42.0", "wheel"]
build-backend = "setuptools.build_meta"
[tool.mypy]
ignore_missing_imports = true
follow_imports = "skip"
plugins = [
"numpy.typing.mypy_plugin",
]
================================================
FILE: requirements.txt
================================================
gensim==4.3.2
nptyping==2.5.0
numba-progress==1.1.0
numba==0.58.1
numpy==1.23.2
scipy<1.13 # triu import issue (https://stackoverflow.com/a/78279318/12519564)
typing_extensions==4.13.2
================================================
FILE: setup.cfg
================================================
[metadata]
name = pecanpy
version = 2.0.10-dev
author = Remy Liu
author_email = liurenmi@msu.edu
description = A parallelized, efficient, and accelerated node2vec
long_description = file: README.md
long_description_content_type = text/markdown
# Links
url = https://github.com/krishnanlab/PecanPy
project_urls =
Documentation = https://pecanpy.readthedocs.io/
# License
# license_files takes plain paths/glob patterns; the "file:" directive is not
# supported for this key and would be treated as part of the pattern.
license_files = LICENSE
license = BSD 3-Clause License
# Search tags
classifiers =
Development Status :: 5 - Production/Stable
Programming Language :: Python
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
keywords =
Network Embedding
[options]
install_requires =
gensim>=4.1.0
numpy>=1.20.0
numba>=0.46.0
numba-progress>=0.0.2
nptyping>=2.0.0
typing_extensions>=4.0.1
zip_safe = false
include_package_data = True
python_requires = >=3.8
# Where is my code
packages = find:
package_dir =
= src
[options.extras_require]
dev =
bump2version==1.0.1
mypy==1.9.0
parameterized==0.9.0
pre-commit==3.5.0; python_version < "3.9"
pre-commit==4.2.0; python_version >= "3.9"
pytest-cov==5.0.0
pytest-xdist==3.6.1
pytest==8.3.5
tox==4.25.0
[options.packages.find]
where = src
[options.entry_points]
console_scripts =
pecanpy = pecanpy.cli:main
================================================
FILE: setup.py
================================================
"""Setup module."""
import setuptools
# No arguments are passed to ``setup()``: the package metadata and options are
# declared in setup.cfg. This shim exists so that ``python setup.py ...``
# invocations (as used by the release workflow) still work.
if __name__ == "__main__":
    setuptools.setup()
================================================
FILE: src/pecanpy/__init__.py
================================================
"""PecanPy: parallelized, efficient, and accelerated node2vec."""
from . import graph
from . import pecanpy
# Package version. The name must stay ``__version__`` because bump2version
# searches this file for ``__version__ = "<current_version>"``.
__version__ = "2.0.10-dev"
__all__ = ["graph", "pecanpy"]
================================================
FILE: src/pecanpy/cli.py
================================================
"""Command line utility for PecanPy.
This is the command line interface for the ``pecanpy`` package.
Examples:
Run PecanPy in command line using ``PreComp`` mode to embed the karate network::
$ pecanpy --input demo/karate.edg --output demo/karate.emb --mode PreComp
Checkout the full list of parameters by::
$ pecanpy --help
"""
import argparse
import warnings
import numba
import numpy as np
from gensim.models import Word2Vec
from . import graph
from . import pecanpy
from .wrappers import Timer
def parse_args():
    """Build the PecanPy command line parser and parse ``sys.argv``.

    Options are registered in the same order as they are shown by ``--help``.
    Options that share the same shape (integer options, float bias
    parameters, boolean switches) are registered from small tables.

    Returns:
        argparse.Namespace: The parsed command line arguments.

    """
    parser = argparse.ArgumentParser(
        description=(
            "Run pecanpy, a parallelized, efficient, and accelerated "
            "Python implementation of node2vec"
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument

    # Input / output locations (both mandatory).
    add("--input", required=True, help="Input graph (.edg or .npz) file path.")
    add(
        "--output",
        required=True,
        help=(
            "Output embeddings file path. Save as .npz file if the specified "
            "file path ends with .npz, otherwise save as a text file using the "
            "gensim save_word2vec_format method."
        ),
    )

    # What to run, and with which execution mode.
    add(
        "--task",
        choices=["pecanpy", "tocsr", "todense"],
        default="pecanpy",
        help="Task to be performed.",
    )
    add(
        "--mode",
        choices=[
            "DenseOTF",
            "FirstOrderUnweighted",
            "PreComp",
            "PreCompFirstOrder",
            "SparseOTF",
        ],
        default="SparseOTF",
        help="PecanPy execution mode.",
    )

    # Integer-valued options: (flag, default, help text).
    integer_options = (
        ("--dimensions", 128, "Number of dimensions."),
        ("--walk-length", 80, "Length of walk per source."),
        ("--num-walks", 10, "Number of walks per source."),
        ("--window-size", 10, "Context size for optimization."),
        ("--epochs", 1, "Number of epochs in SGD when training Word2Vec"),
        (
            "--workers",
            0,
            "Number of parallel workers (0 to use all available threads).",
        ),
    )
    for flag, default_value, help_text in integer_options:
        add(flag, type=int, default=default_value, help=help_text)

    # Random walk bias parameters, both defaulting to 1.
    bias_options = (
        ("--p", "Return hyperparameter."),
        ("--q", "Inout hyperparameter."),
    )
    for flag, help_text in bias_options:
        add(flag, type=float, default=1, help=help_text)

    # Boolean switches (off unless given on the command line).
    switches = (
        ("--weighted", "Boolean specifying (un)weighted."),
        ("--directed", "Graph is (un)directed."),
        ("--verbose", "Print out training details"),
        ("--extend", "Use node2vec+ extension"),
    )
    for flag, help_text in switches:
        add(flag, action="store_true", help=help_text)

    add("--gamma", type=float, default=0, help="Noisy edge threshold parameter.")
    add(
        "--random_state",
        type=int,
        default=None,
        help="Random seed for generating random walks.",
    )
    add(
        "--delimiter",
        type=str,
        default="\t",
        help="Delimiter used between node IDs.",
    )
    add(
        "--implicit_ids",
        action="store_true",
        help="If set, use canonical node ordering for the node IDs.",
    )

    return parser.parse_args()
def check_mode(g, args):
    """Validate the selected mode and recommend a better one if applicable.

    The checks are applied in the following order; the function returns as
    soon as one of the first four applies:

    1. ``FirstOrderUnweighted`` requires an unweighted graph and ``p = q = 1``.
    2. Any other mode on an unweighted graph with ``p = q = 1`` gets a
       recommendation to use ``FirstOrderUnweighted``.
    3. ``PreCompFirstOrder`` requires ``p = q = 1``.
    4. Any other mode with ``p = q = 1`` gets a recommendation to use
       ``PreCompFirstOrder``.
    5. Otherwise a mode is recommended from the graph size and density.

    Args:
        g: Loaded graph exposing ``num_nodes`` and ``density``. Only read when
            none of the first four checks applies.
        args: Parsed command line arguments; ``mode``, ``weighted``, ``p`` and
            ``q`` are read.

    Raises:
        ValueError: If ``FirstOrderUnweighted`` is selected with a weighted
            graph or with ``p`` or ``q`` different from 1, or if
            ``PreCompFirstOrder`` is selected with ``p`` or ``q`` different
            from 1.

    """
    mode = args.mode
    weighted = args.weighted
    p = args.p
    q = args.q
    first_order = p == q == 1

    # Check unweighted first order random walk usage
    if mode == "FirstOrderUnweighted":
        if not first_order or weighted:
            raise ValueError(
                f"FirstOrderUnweighted only works when weighted = False and "
                f"p = q = 1, got {weighted=}, {p=}, {q=}",
            )
        return

    # From here on the mode is known not to be FirstOrderUnweighted.
    if first_order and not weighted:
        warnings.warn(
            "When p = 1 and q = 1 with unweighted graph, it is highly "
            f"recommended to use FirstOrderUnweighted over {mode} (current "
            "selection). The runtime could be improved greatly with improved "
            "memory usage.",
            stacklevel=2,
        )
        return

    # Check first order random walk usage
    if mode == "PreCompFirstOrder":
        if not first_order:
            raise ValueError(
                f"PreCompFirstOrder only works when p = q = 1, got {p=}, {q=}",
            )
        return

    # From here on the mode is known not to be PreCompFirstOrder.
    if first_order:
        warnings.warn(
            "When p = 1 and q = 1, it is highly recommended to use "
            f"PreCompFirstOrder over {mode} (current selection). The runtime "
            "could be improved greatly with low memory usage.",
            stacklevel=2,
        )
        return

    # Check network density and recommend appropriate mode
    g_size = g.num_nodes
    g_dens = g.density
    is_dense = g_dens >= 0.2
    is_very_sparse = g_dens < 0.001
    is_small = g_size < 10000

    if is_dense and mode != "DenseOTF":
        warnings.warn(
            f"Network density = {g_dens:.3f} (>= 0.2), it is recommended to use "
            f"DenseOTF over {mode} (current selection)",
            stacklevel=2,
        )
    if is_very_sparse and is_small and mode != "PreComp":
        warnings.warn(
            f"Network density = {g_dens:.2e} (< 0.001) with {g_size} nodes "
            f"(< 10000), it is recommended to use PreComp over {mode} (current "
            "selection)",
            stacklevel=2,
        )
    if not is_very_sparse and not is_dense and mode != "SparseOTF":
        warnings.warn(
            f"Network density = {g_dens:.3f}, it is recommended to use "
            f"SparseOTF over {mode} (current selection)",
            stacklevel=2,
        )
    if is_very_sparse and not is_small and mode != "SparseOTF":
        # Scientific notation: a density below 0.001 would always be printed
        # as 0.000 with a fixed three-decimal format.
        warnings.warn(
            f"Network density = {g_dens:.2e} (< 0.001) with {g_size} nodes "
            f"(>= 10000), it is recommended to use SparseOTF over {mode} "
            "(current selection)",
            stacklevel=2,
        )
@Timer("load Graph")
def read_graph(args):
    """Read input network to memory.

    Depending on the mode selected, reads the network either in CSR
    representation (``PreComp`` and ``SparseOTF``) or 2d numpy array
    (``DenseOTF``).

    When ``args.task`` is ``tocsr`` or ``todense``, the edge list is converted,
    saved to ``args.output`` and the program exits with status 0.

    Args:
        args: Parsed command line arguments.

    Returns:
        The graph object for the selected mode, loaded from ``args.input``.

    Raises:
        NotImplementedError: If both ``--directed`` and ``--extend`` are set.
        ValueError: If ``args.mode`` does not name an attribute of the
            ``pecanpy`` module.
        SystemExit: After a ``tocsr`` / ``todense`` conversion has been saved.

    """
    path = args.input
    weighted = args.weighted
    directed = args.directed
    extend = args.extend
    task = args.task
    mode = args.mode

    if directed and extend:
        raise NotImplementedError("Node2vec+ not implemented for directed graph yet.")

    if extend and not weighted:
        print("NOTE: node2vec+ is equivalent to node2vec for unweighted graphs.")

    if task in ("tocsr", "todense"):  # perform conversion then save and exit
        g = graph.SparseGraph() if task == "tocsr" else graph.DenseGraph()
        g.read_edg(path, weighted, directed, args.delimiter)
        g.save(args.output)
        # The ``exit`` helper is injected by the ``site`` module and is not
        # guaranteed to exist (e.g. ``python -S``); raise SystemExit directly.
        raise SystemExit(0)

    pecanpy_mode = getattr(pecanpy, mode, None)
    if pecanpy_mode is None:
        # Fail with a clear message instead of "'NoneType' is not callable".
        raise ValueError(f"Unknown PecanPy mode: {mode!r}")

    g = pecanpy_mode(
        args.p,
        args.q,
        args.workers,
        args.verbose,
        extend,
        args.gamma,
        args.random_state,
    )

    if path.endswith(".npz"):
        g.read_npz(path, weighted, implicit_ids=args.implicit_ids)
    else:
        g.read_edg(path, weighted, directed, args.delimiter)

    check_mode(g, args)

    return g
@Timer("train embeddings")
def learn_embeddings(args, walks):
    """Train a skip-gram model on the walks and write the embeddings out.

    The embeddings are stored as an ``.npz`` archive (keys ``IDs`` and
    ``data``) when ``args.output`` ends with ``.npz``, and in the word2vec
    text format otherwise.

    Args:
        args: Parsed command line arguments providing ``dimensions``,
            ``window_size``, ``workers``, ``epochs``, ``random_state`` and
            ``output``.
        walks: The random walks used as training sentences.

    """
    w2v_options = {
        "vector_size": args.dimensions,
        "window": args.window_size,
        "min_count": 0,
        "sg": 1,
        "workers": args.workers,
        "epochs": args.epochs,
        "seed": args.random_state,
    }
    model = Word2Vec(walks, **w2v_options)

    destination = args.output
    if not destination.endswith(".npz"):
        model.wv.save_word2vec_format(destination)
        return

    np.savez(destination, IDs=model.wv.index_to_key, data=model.wv.vectors)
@Timer("pre-compute transition probabilities")
def preprocess(g):
    """Run the graph's transition probability pre-computation under a timer.

    Args:
        g: Graph object exposing ``preprocess_transition_probs``.

    """
    precompute = g.preprocess_transition_probs
    precompute()
@Timer("generate walks")
def simulate_walks(args, g):
    """Generate random walks on the graph under a timer.

    Args:
        args: Parsed command line arguments providing ``num_walks`` and
            ``walk_length``.
        g: Graph object exposing ``simulate_walks``.

    Returns:
        Whatever ``g.simulate_walks`` returns for the requested settings.

    """
    walks = g.simulate_walks(args.num_walks, args.walk_length)
    return walks
def main():
    """Run the full pipeline: parse, load, preprocess, walk, and embed."""
    args = parse_args()

    # A worker count of zero selects numba's default number of threads.
    if args.workers == 0:
        args.workers = numba.config.NUMBA_DEFAULT_NUM_THREADS
    numba.set_num_threads(args.workers)

    # Each stage is wrapped by a timer decorator at its definition.
    g = read_graph(args)
    preprocess(g)
    learn_embeddings(args, simulate_walks(args, g))


# Entry point when the module is executed as a script.
if __name__ == "__main__":
    main()
================================================
FILE: src/pecanpy/experimental.py
================================================
"""Experimental features."""
import numpy as np
from numba import njit
from pecanpy.pecanpy import Base
from pecanpy.rw.dense_rw import DenseRWGraph
class Node2vecPlusPlus(Base, DenseRWGraph):
    """Continuous extension of node2vec+ with DenseOTF framework.

    In node2vec+ (see `DenseRWGraph.get_extended_normalized_probs`), there is
    discontinuous region of the bias-factor (alpha). More specifically, the
    transition between the noisy-edge region (w1 < 1 and w2 < 1, where w1 is
    the normalized edge weight connecting from current to the previous node,
    and w2 is similarly defined for the edge weight connecting from the next
    to the previous node), and the "in-out" region (w1 > 1 or w2 > 1).

    This continuous extension version of node2vec+, i.e., node2vec++, aims to
    provide continuity to those regions by parameterizing the bias-factor as
    a continuous function of w1 and w2. The basic idea is to use w2 to control
    the interpolation between 1 and 1 / q as before, but in addition, use w1
    to parameterize the curvature of the interpolation, so as w1 approaches
    zero, the bias-factor goes to min{1, 1 / q} (note that previously, the
    bias-factor is set to min{1, 1 / q} whenever w1 falls below one).

    """

    def __init__(self, *args, **kwargs):
        """Initialize through ``Base`` only, forwarding all arguments."""
        Base.__init__(self, *args, **kwargs)

    def get_move_forward(self):
        """Wrap ``move_forward``.

        Returns:
            A ``numba.njit`` compiled function that takes the current node
            index (and optionally the previous node index) and returns the
            index of the next node, sampled from the node2vec++ transition
            probabilities.

        """
        # Bind everything to locals so the jitted closure captures plain
        # values instead of ``self``.
        data = self.data
        nonzero = self.nonzero
        p = self.p
        q = self.q
        noise_thresholds = self.get_noise_thresholds()
        get_normalized_probs = self.get_normalized_probs

        @njit(nogil=True)
        def move_forward(cur_idx, prev_idx=None):
            """Move to next node."""
            normalized_probs = get_normalized_probs(
                data,
                nonzero,
                p,
                q,
                cur_idx,
                prev_idx,
                noise_thresholds,
            )
            # Inverse transform sampling over the neighbors of cur_idx.
            cdf = np.cumsum(normalized_probs)
            choice = np.searchsorted(cdf, np.random.random())
            nbrs = np.where(nonzero[cur_idx])[0]

            return nbrs[choice]

        return move_forward

    @staticmethod
    @njit(nogil=True)
    def get_normalized_probs(
        data,
        nonzero,
        p,
        q,
        cur_idx,
        prev_idx,
        noise_threshold_ary,
    ):
        """Calculate node2vec++ transition probabilities.

        Args:
            data: Adjacency matrix, indexed by row to get a node's edge
                weights.
            nonzero: Boolean mask of ``data`` marking existing edges.
            p: Return parameter.
            q: In-out parameter.
            cur_idx: Index of the current node.
            prev_idx: Index of the previous node, or None for the first step
                (in which case no bias is applied).
            noise_threshold_ary: Per-node noisy edge thresholds used to
                normalize the edge weights.

        Returns:
            Normalized transition probabilities over the neighbors of
            ``cur_idx``.

        """
        cur_nbrs_ind = nonzero[cur_idx]
        cur_nbrs_weight = data[cur_idx].copy()

        if prev_idx is not None:  # 2nd order biased walks
            prev_nbrs_weight = data[prev_idx].copy()

            # Note: we assume here the network is undirected, hence the edge
            # weight connecting the next to prev is the same as the reverse.
            out_ind = cur_nbrs_ind & (prev_nbrs_weight < noise_threshold_ary)
            out_ind[prev_idx] = False  # exclude previous state from out biases

            t = prev_nbrs_weight[out_ind] / noise_threshold_ary[out_ind]
            # Determine whether to use '1 - t' or 't' depending on whether q
            # is less than or greater than one so that alpha is suppressed to
            # min{1, 1 / q} as w1 approaches 0.
            t = 1 - t.clip(0, 1) if q < 1 else t.clip(0, 1)
            b = cur_nbrs_weight[out_ind] / noise_threshold_ary[out_ind]

            # compute out biases
            scale = np.abs(1 - 1 / q)
            offset = np.minimum(1, 1 / q)
            # b bends the interpolation in t: b = 1 gives the linear case,
            # and b -> 0 drives the interpolation term to 0 so that alpha
            # tends to ``offset``. The denominator must depend on t,
            # otherwise b cancels out and has no effect at all.
            alpha = t * b / (1 + (b - 1) * t) * scale + offset

            cur_nbrs_weight[out_ind] *= alpha  # apply out biases
            cur_nbrs_weight[prev_idx] /= p  # apply the return bias

        unnormalized_probs = cur_nbrs_weight[cur_nbrs_ind]
        normalized_probs = unnormalized_probs / unnormalized_probs.sum()

        return normalized_probs
================================================
FILE: src/pecanpy/graph.py
================================================
"""Lite graph objects used by pecanpy."""
import warnings
import numpy as np
from .typing import AdjMat
from .typing import AdjNonZeroMat
from .typing import CSR
from .typing import Dict
from .typing import Float32Array
from .typing import Iterator
from .typing import List
from .typing import Optional
from .typing import Sequence
from .typing import Tuple
from .typing import Uint32Array
class BaseGraph:
    """Common base for all graph objects.

    Keeps track of the node IDs and the ID-to-index mapping, and offers the
    shared ``num_nodes`` and ``density`` properties. Concrete graph classes
    are expected to supply ``num_edges``.

    """

    def __init__(self):
        self._node_ids: List[str] = []
        self._node_idmap: Dict[str, int] = {}  # id -> index

    @property
    def nodes(self) -> List[str]:
        """Node IDs, ordered by node index."""
        return self._node_ids

    @property
    def num_nodes(self) -> int:
        """Number of nodes currently registered in the graph."""
        return len(self.nodes)

    @property
    def num_edges(self) -> int:
        """Number of edges; must be provided by derived classes."""
        raise NotImplementedError(
            f"{self.__class__.__name__} does not have num_edges, use the "
            f"derived classes like SparseGraph and DenseGraph instead.",
        )

    @property
    def density(self) -> float:
        """Edge density, i.e. ``num_edges / (num_nodes * (num_nodes - 1))``."""
        edge_count = self.num_edges
        node_count = self.num_nodes
        return edge_count / node_count / (node_count - 1)

    def set_node_ids(
        self,
        node_ids: Optional[Sequence[str]],
        implicit_ids: bool = False,
        num_nodes: Optional[int] = None,
    ):
        """Replace the node ID list and rebuild the ID-to-index mapping.

        Args:
            node_ids (:obj:`list` of :obj:`str`, optional): Node IDs to use.
                They are only used when `implicit_ids` is False; if they are
                missing, the IDs fall back to ``"0", "1", ...`` and a warning
                is issued.
            implicit_ids (bool): Use ``"0", "1", ...`` as the node IDs,
                ignoring `node_ids`. Setting this to True also acts as a
                confirmation that silences the fallback warning.
            num_nodes (int, optional): Number of nodes, required whenever
                the implicit IDs are used.

        Raises:
            ValueError: If implicit IDs are needed but `num_nodes` is None.

        """
        explicit = (node_ids is not None) and (not implicit_ids)

        if explicit:
            new_ids = list(node_ids)
        elif num_nodes is None:
            raise ValueError(
                "Need to specify `num_nodes` when setting implicit node IDs.",
            )
        else:
            new_ids = [str(idx) for idx in range(num_nodes)]

        # Commit the IDs and the mapping before any warning is issued.
        self._node_ids = new_ids
        self._node_idmap = {node_id: idx for idx, node_id in enumerate(new_ids)}

        # Falling back to implicit IDs without confirmation deserves a notice.
        if not explicit and not implicit_ids:
            warnings.warn(
                "WARNING: Implicitly set node IDs to the canonical node "
                "ordering due to missing IDs field in the raw CSR npz "
                "file. This warning message can be suppressed by setting "
                "implicit_ids to True in the read_npz function call, or "
                "by setting the --implicit_ids flag in the CLI",
                stacklevel=2,
            )

    def get_has_nbrs(self):
        """Placeholder to be overridden by derived classes."""
        raise NotImplementedError

    def get_move_forward(self):
        """Placeholder to be overridden by derived classes."""
        raise NotImplementedError
class AdjlstGraph(BaseGraph):
    """Adjacency list Graph object used for reading/writing edge list files.

    Sparse Graph object that stores graph as adjacency list.

    Note:
        AdjlstGraph is only used for reading/writing edge list files and do not
        support random walk computations since Numba njit do not work with
        Python data structures like list and dict.

    Examples:
        Read ``.edg`` file and create ``AdjlstGraph`` object using
        ``.read`` method.

        >>> from pecanpy.graph import AdjlstGraph
        >>>
        >>> # initialize AdjlstGraph object
        >>> g = AdjlstGraph()
        >>>
        >>> # read graph from edgelist
        >>> g.read(path_to_edg_file, weighted=True, directed=False)
        >>>
        >>> indptr, indices, data = g.to_csr()  # convert to csr
        >>>
        >>> dense_mat = g.to_dense()  # convert to dense adjacency matrix
        >>>
        >>> g.save(edg_outpath)  # save the graph to an edge list file

    """

    def __init__(self):
        super().__init__()
        self._data: List[Dict[int, float]] = []  # list of nbrs idx -> weights
        self._num_edges: int = 0

    @property
    def edges_iter(self) -> Iterator[Tuple[int, int, float]]:
        """Return an iterator that iterates over all edges.

        Edges are yielded as (head, tail, weight) triples, ordered by head
        index and then by tail index.
        """
        for head, head_nbrs in enumerate(self._data):
            for tail in sorted(head_nbrs):
                yield head, tail, head_nbrs[tail]

    @property
    def edges(self) -> List[Tuple[int, int, float]]:
        """Return a list of triples (head, tail, weight) representing edges."""
        return list(self.edges_iter)

    @property
    def num_edges(self):
        """Return the number of (directed) edge entries stored in the graph."""
        return self._num_edges

    @staticmethod
    def _read_edge_line(
        edge_line: str,
        weighted: bool,
        delimiter: str,
    ) -> Tuple[str, str, float]:
        """Read a line from the edge list file.

        Args:
            edge_line (str): A single line of the edge list file.
            weighted (bool): Whether a third column with the weight is
                expected.
            delimiter (str): Delimiter separating the columns.

        Returns:
            The two node IDs and the edge weight (1.0 if unweighted).

        Raises:
            ValueError: If the graph is weighted and the line does not have
                exactly three columns.

        """
        terms = edge_line.strip().split(delimiter)
        id1, id2 = terms[0].strip(), terms[1].strip()

        weight = 1.0
        if weighted:
            if len(terms) != 3:
                raise ValueError(
                    f"Expecting three columns in the edge list file for a "
                    f"weighted graph, got {len(terms)} instead: {edge_line!r}",
                )
            weight = float(terms[-1])

        return id1, id2, weight

    @staticmethod
    def _is_valid_edge_weight(id1: str, id2: str, weight: float) -> bool:
        """Check if the edge weight is positive.

        A warning is issued for non-positive weights, for which False is
        returned.
        """
        if weight <= 0:
            edg_str = f"w({id1},{id2}) = {weight}"
            warnings.warn(
                f"Non-positive edge ignored: {edg_str}",
                RuntimeWarning,
                stacklevel=2,
            )
            return False
        return True

    def _check_edge_existence(
        self,
        id1: str,
        id2: str,
        idx1: int,
        idx2: int,
        weight: float,
    ):
        """Check if an edge exists.

        If the edge to be added already exists and the new edge weight is
        different from the existing edge weights, print warning message.

        """
        if idx2 in self._data[idx1] and self._data[idx1][idx2] != weight:
            warnings.warn(
                f"edge from {id1} to {id2} exists, with "
                f"value of {self._data[idx1][idx2]:.2f}. "
                f"Now overwrite to {weight:.2f}.",
                RuntimeWarning,
                stacklevel=2,
            )

    def get_node_idx(self, node_id: str) -> int:
        """Get index of the node and create new node when necessary."""
        self.add_node(node_id)
        return self._node_idmap[node_id]

    def add_node(self, node_id: str):
        """Create a new node.

        Add a new node to the graph if not already existing, by updating the
        ID list, ID map, and the adjacency list data. Otherwise pass through
        without further actions.

        Note:
            Does not raise error even if the node already exists.

        """
        if node_id not in self._node_idmap:
            self._node_idmap[node_id] = self.num_nodes
            self.nodes.append(node_id)
            self._data.append({})

    def _add_edge_from_idx(self, idx1: int, idx2: int, weight: float):
        """Add an edge based on the head and tail node index with weight.

        Overwriting an existing entry only updates its weight; the edge
        counter is incremented only for entries that are new, so that
        ``num_edges`` always matches the number of stored entries (and thus
        the number of nonzeros produced by ``to_csr``).
        """
        nbrs = self._data[idx1]
        if idx2 not in nbrs:
            self._num_edges += 1
        nbrs[idx2] = weight

    def add_edge(
        self,
        id1: str,
        id2: str,
        weight: float = 1.0,
        directed: bool = False,
    ):
        """Add an edge to the graph.

        Note:
            Non-positive edges are ignored.

        Args:
            id1 (str): first node id.
            id2 (str): second node id.
            weight (float): the edge weight, default is 1.0
            directed (bool): whether the edge is directed or not.

        """
        if self._is_valid_edge_weight(id1, id2, weight):
            idx1, idx2 = map(self.get_node_idx, (id1, id2))
            self._check_edge_existence(id1, id2, idx1, idx2, weight)

            self._add_edge_from_idx(idx1, idx2, weight)
            if not directed:
                # Mirror the entry so both directions are stored.
                self._add_edge_from_idx(idx2, idx1, weight)

    def read(
        self,
        path: str,
        weighted: bool,
        directed: bool,
        delimiter: str = "\t",
    ):
        """Read an edgelist file and create sparse graph.

        Note:
            Implicitly discard non-positive weighted edges; if the same edge
            is defined multiple times with different edge weights, then the
            last specified weight will be used (warning for such behavior
            will be printed).

        Args:
            path (str): path to edgelist file, where the file is tab
                separated and contains 2 or 3 columns depending on whether
                the input graph is weighted, where the the first column
                contains the source nodes and the second column contains the
                destination nodes that interact with the corresponding source
                nodes.
            weighted (bool): whether the graph is weighted. If unweighted,
                only two columns are expected in the edgelist file, and the
                edge weights are implicitly set to 1 for all interactions. If
                weighted, a third column encoding the weight of the interaction
                in numeric value is expected.
            directed (bool): whether the graph is directed, if undirected, the
                edge connecting from destination node to source node is created
                with same edge weight from source node to destination node.
            delimiter (str): delimiter of the edge list file, default is tab.

        """
        with open(path, encoding="utf-8") as f:
            for edge_line in f:
                edge = self._read_edge_line(edge_line, weighted, delimiter)
                self.add_edge(*edge, directed)

    def save(self, path: str, unweighted: bool = False, delimiter: str = "\t"):
        """Save AdjLst as an ``.edg`` edge list file.

        Args:
            path (str): Path of the output edge list file.
            unweighted (bool): If set to True, only write two columns,
                corresponding to the head and tail nodes of the edges, and
                ignore the edge weights (default: :obj:`False`).
            delimiter (str): Delimiter for separating fields.

        """
        with open(path, "w", encoding="utf-8") as f:
            for h, t, w in self.edges_iter:
                h_id, t_id = self.nodes[h], self.nodes[t]
                terms = (h_id, t_id) if unweighted else (h_id, t_id, str(w))
                f.write(f"{delimiter.join(terms)}\n")

    def to_csr(self) -> CSR:
        """Construct compressed sparse row matrix.

        Returns:
            The ``indptr``, ``indices`` and ``data`` arrays, where the column
            indices within each row are sorted.

        """
        indptr: Uint32Array = np.zeros(len(self.nodes) + 1, dtype=np.uint32)
        for i, row_data in enumerate(self._data):
            indptr[i + 1] = indptr[i] + len(row_data)

        # last element of indptr indicates the total number of nonzero entries
        indices = np.zeros(indptr[-1], dtype=np.uint32)
        data = np.zeros(indptr[-1], dtype=np.float32)

        for i, nbrs in enumerate(self._data):
            if len(nbrs) == 0:
                continue

            new_indices, new_data = zip(*[(j, nbrs[j]) for j in sorted(nbrs)])
            chunk = slice(indptr[i], indptr[i + 1])
            indices[chunk] = np.array(new_indices, dtype=np.uint32)
            data[chunk] = np.array(new_data, dtype=np.float32)

        return indptr, indices, data

    def to_dense(self) -> AdjMat:
        """Construct dense adjacency matrix.

        Note:
            This method does not return a DenseGraph object, but instead returns
            a dense adjacency matrix as NDArray, where the index is the same
            as that of ``nodes``.

        Return:
            NDArray: Full adjacency matrix as 2d numpy array.

        """
        n_nodes = len(self.nodes)
        mat = np.zeros((n_nodes, n_nodes))

        for src_node, src_nbrs in enumerate(self._data):
            for dst_node in src_nbrs:
                mat[src_node, dst_node] = src_nbrs[dst_node]

        return mat

    @classmethod
    def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):
        """Construct graph using adjacency matrix and node IDs.

        Every nonzero entry of the matrix is added as an edge.

        Args:
            adj_mat(NDArray): 2D numpy array of adjacency matrix
            node_ids(:obj:`list` of str): node ID list

        Return:
            An adjacency graph object representing the adjacency matrix.

        """
        g = cls(**kwargs)

        # Setup node idmap in the order of node_ids
        for node_id in node_ids:
            g.add_node(node_id)

        # Fill in edge data
        for idx1, idx2 in zip(*np.where(adj_mat != 0)):
            g._add_edge_from_idx(idx1, idx2, adj_mat[idx1, idx2])

        return g
class SparseGraph(BaseGraph):
    """Sparse Graph object that stores graph in CSR format.

    Examples:
        Read ``.edg`` file and create ``SparseGraph`` object using
        ``.read_edg`` method.

        >>> from pecanpy.graph import SparseGraph
        >>>
        >>> # initialize SparseGraph object
        >>> g = SparseGraph()
        >>>
        >>> # read graph from edgelist
        >>> g.read_edg(path_to_edg_file, weighted=True, directed=False)
        >>>
        >>> # save the csr graph as npz file to be used later
        >>> g.save(npz_outpath)

    """

    def __init__(self):
        super().__init__()
        self.data: Optional[Float32Array] = None
        self.indptr: Optional[Uint32Array] = None
        self.indices: Optional[Uint32Array] = None

    @property
    def num_edges(self) -> int:
        """Return the number of edges in the graph.

        Raises:
            ValueError: If no graph has been loaded yet.

        """
        if self.indptr is not None:
            # Last entry of indptr is the number of stored entries; cast the
            # numpy scalar to a Python int as the annotation promises.
            return int(self.indptr[-1])
        else:
            raise ValueError("Empty graph.")

    def read_edg(
        self,
        path: str,
        weighted: bool,
        directed: bool,
        delimiter: str = "\t",
    ):
        """Create CSR sparse graph from edge list.

        First create ``AdjlstGraph`` by reading the edge list file, and then
        convert to ``SparseGraph`` via ``to_csr``.

        Args:
            path (str): path to edgelist file.
            weighted (bool): whether the graph is weighted.
            directed (bool): whether the graph is directed.
            delimiter (str): delimiter used between node IDs.

        """
        g = AdjlstGraph()
        g.read(path, weighted, directed, delimiter)
        self.set_node_ids(g.nodes)
        self.indptr, self.indices, self.data = g.to_csr()

    def read_npz(self, path: str, weighted: bool, implicit_ids: bool = False):
        """Directly read a CSR sparse graph.

        Note:
            To generate a CSR file compatible with PecanPy, first load the graph
            using ``SparseGraph.read_edg``. Then save the sparse graph to a csr
            file using the ``save`` method from ``SparseGraph``. The saved
            ``.npz`` file can then be loaded directly by ``SparseGraph`` later.

        Args:
            path (str): path to the csr file, which is an npz file with four
                arrays with keys 'IDs', 'data', 'indptr', 'indices', which
                correspond to the node IDs, the edge weights, the offset array
                for each node, and the indices of the edges.
            weighted (bool): whether the graph is weighted, if unweighted,
                all edge weights will be converted to 1.
            implicit_ids (bool): Implicitly set the node IDs to the canonical
                node ordering from the CSR graph. If unset and the `IDs` field
                is not found in the input CSR graph, a warning message will be
                displayed on screen. The missing `IDs` field can happen, for
                example, when the user uses the CSR graph prepared by
                `scipy.sparse.csr`.

        Raises:
            KeyError: If any of 'indptr', 'indices' or 'data' is missing from
                the npz file.

        """
        # The archive keeps a file handle open until closed, so read all
        # fields inside a context manager.
        with np.load(path) as raw:
            self.indptr = raw["indptr"].astype(np.uint32)
            self.indices = raw["indices"].astype(np.uint32)
            self.data = raw["data"].astype(np.float32)
            node_ids = raw.get("IDs")  # None if the archive has no IDs field

        if not weighted:
            self.data[:] = 1.0  # overwrite edge weights with constant

        self.set_node_ids(
            node_ids,
            implicit_ids=implicit_ids,
            num_nodes=int(self.indptr.size - 1),
        )

    def save(self, path: str):
        """Save CSR as ``.csr.npz`` file."""
        np.savez(
            path,
            IDs=self.nodes,
            data=self.data,
            indptr=self.indptr,
            indices=self.indices,
        )

    @classmethod
    def from_adjlst_graph(cls, adjlst_graph, **kwargs):
        """Construct csr graph from adjacency list graph.

        Args:
            adjlst_graph (:obj:`pecanpy.graph.AdjlstGraph`): Adjacency list
                graph to be converted.

        """
        g = cls(**kwargs)
        g.set_node_ids(adjlst_graph.nodes)
        g.indptr, g.indices, g.data = adjlst_graph.to_csr()
        return g

    @classmethod
    def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):
        """Construct csr graph using adjacency matrix and node IDs.

        Note:
            Every nonzero entry of the matrix is treated as an edge (the
            conversion goes through ``AdjlstGraph.from_mat``).

        Args:
            adj_mat(NDArray): 2D numpy array of adjacency matrix
            node_ids(:obj:`list` of str): node ID list

        """
        g = cls(**kwargs)
        g.set_node_ids(node_ids)
        adjlst_graph = AdjlstGraph.from_mat(adj_mat, node_ids)
        g.indptr, g.indices, g.data = adjlst_graph.to_csr()
        return g
class DenseGraph(BaseGraph):
    """Dense Graph object that stores graph as array.

    Examples:
        Read ``.npz`` files and create ``DenseGraph`` object using ``read_npz``

        >>> from pecanpy.graph import DenseGraph
        >>>
        >>> g = DenseGraph()  # initialize DenseGraph object
        >>>
        >>> g.read_npz(path_to_npz_file, weighted=True)

        Read ``.edg`` files and create ``DenseGraph`` object using ``read_edg``

        >>> from pecanpy.graph import DenseGraph
        >>>
        >>> # initialize DenseGraph object
        >>> g = DenseGraph()
        >>>
        >>> # read graph from edgelist
        >>> g.read_edg(path_to_edg_file, weighted=True, directed=False)
        >>>
        >>> # save the dense graph as npz file to be used later
        >>> g.save(npz_outpath)

    """

    def __init__(self):
        super().__init__()
        self._data: Optional[AdjMat] = None
        self._nonzero: Optional[AdjNonZeroMat] = None

    @property
    def num_edges(self) -> int:
        """Return the number of edges in the graph.

        Raises:
            ValueError: If no adjacency matrix has been set yet.

        """
        if self.nonzero is not None:
            # Cast the numpy scalar to a Python int as the annotation promises.
            return int(self.nonzero.sum())
        else:
            raise ValueError("Empty graph.")

    @property
    def data(self) -> Optional[AdjMat]:
        """Return the adjacency matrix."""
        return self._data

    @data.setter
    def data(self, data: AdjMat):
        """Set adjacency matrix and the corresponding nonzero matrix."""
        self._data = data.astype(float)
        self._nonzero = np.array(self._data != 0, dtype=bool)

    @property
    def nonzero(self) -> Optional[AdjNonZeroMat]:
        """Return the nonzero mask for the adjacency matrix."""
        return self._nonzero

    def read_npz(self, path: str, weighted: bool, implicit_ids: bool = False):
        """Read ``.npz`` file and create dense graph.

        Args:
            path (str): path to ``.npz`` file, which holds the adjacency
                matrix under the key 'data' and optionally the node IDs
                under the key 'IDs'.
            weighted (bool): whether the graph is weighted, if unweighted,
                all nonzero weights will be converted to 1.
            implicit_ids (bool): Implicitly set the node IDs to the canonical
                ordering from the dense adjacency matrix object. If unset and
                the `IDs` field is not found in the object, a warning message
                will be displayed on screen. This warning message can be
                suppressed if `implicit_ids` is set to True as a confirmation
                of the behavior.

        Raises:
            KeyError: If 'data' is missing from the npz file.

        """
        # The archive keeps a file handle open until closed, so read all
        # fields inside a context manager.
        with np.load(path) as raw:
            self.data = raw["data"]
            node_ids = raw.get("IDs")  # None if the archive has no IDs field

        if not weighted:  # overwrite edge weights with constant
            self.data = self.nonzero * 1.0  # type: ignore

        self.set_node_ids(
            node_ids,
            implicit_ids=implicit_ids,
            num_nodes=self.data.shape[0],
        )

    def read_edg(
        self,
        path: str,
        weighted: bool,
        directed: bool,
        delimiter: str = "\t",
    ):
        """Read an edgelist file and construct dense graph.

        Args:
            path (str): path to edgelist file.
            weighted (bool): whether the graph is weighted.
            directed (bool): whether the graph is directed.
            delimiter (str): delimiter used between node IDs.

        """
        g = AdjlstGraph()
        g.read(path, weighted, directed, delimiter)

        self.set_node_ids(g.nodes)
        self.data = g.to_dense()

    def save(self, path: str):
        """Save dense graph as ``.dense.npz`` file."""
        np.savez(path, data=self.data, IDs=self.nodes)

    @classmethod
    def from_adjlst_graph(cls, adjlst_graph, **kwargs):
        """Construct dense graph from adjacency list graph.

        Args:
            adjlst_graph (:obj:`pecanpy.graph.AdjlstGraph`): Adjacency list
                graph to be converted.

        """
        g = cls(**kwargs)
        g.set_node_ids(adjlst_graph.nodes)
        g.data = adjlst_graph.to_dense()
        return g

    @classmethod
    def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):
        """Construct dense graph using adjacency matrix and node IDs.

        Args:
            adj_mat(NDArray): 2D numpy array of adjacency matrix
            node_ids(:obj:`list` of str): node ID list

        """
        g = cls(**kwargs)
        g.data = adj_mat
        g.set_node_ids(node_ids)
        return g
================================================
FILE: src/pecanpy/pecanpy.py
================================================
"""Different strategies for generating node2vec walks."""
import numpy as np
from gensim.models import Word2Vec
from numba import njit
from numba import prange
from numba_progress import ProgressBar
from .graph import BaseGraph
from .rw import DenseRWGraph
from .rw import SparseRWGraph
from .typing import Embeddings
from .typing import Float32Array
from .typing import HasNbrs
from .typing import List
from .typing import MoveForward
from .typing import Optional
from .typing import Uint32Array
from .typing import Uint64Array
from .wrappers import Timer
try:
from numba.np.ufunc.parallel import get_thread_id
except ImportError: # numba<0.56
from numba.np.ufunc.parallel import _get_thread_id as get_thread_id
class Base(BaseGraph):
    """Base node2vec object.

    This base object provides the skeleton for the node2vec walk algorithm,
    which consists of the ``simulate_walks`` method that generate node2vec
    random walks. In contrast to the original Python implementation of
    node2vec, it is parallelized where each process generates walks
    independently.

    Args:
        p (float): return parameter, value less than 1 encourages returning
            back to previous vertex, and discourage for value greater than 1
            (default: 1).
        q (float): in-out parameter, value less than 1 encourages walks to
            go "outward", and value greater than 1 encourage walking within
            a localized neighborhood (default: 1)
        workers (int): number of threads to be spawned for running node2vec
            including walk generation and word2vec embedding (default: 1)
        verbose (bool): show progress bar for walk generation.
        extend (bool): use node2vec+ extension if set to :obj:`True`
            (default: :obj:`False`).
        gamma (float): Multiplication factor for the std term of edge
            weights added to the average edge weights as the noisy edge
            threshold, only used by node2vec+ (default: 0)
        random_state (int, optional): Random seed for generating random
            walks. Note that to fully ensure reproducibility, use single
            thread (i.e., workers=1), and potentially need to set the
            Python environment variable ``PYTHONHASHSEED`` to match the
            random_state (default: :obj:`None`).

    Note:
        The ``preprocess_transition_probs`` is required for implementations that
        precomputes and stores 2nd order transition probabilities.

    Examples:
        Generate node2vec embeddings

        >>> from pecanpy import pecanpy as node2vec
        >>>
        >>> # initialize node2vec object, similarly for SparseOTF and DenseOTF
        >>> g = node2vec.PreComp(p=0.5, q=1, workers=4, verbose=True)
        >>> # alternatively, can specify ``extend=True`` for using node2vec+
        >>>
        >>> # load graph from edgelist file
        >>> g.read_edg(path_to_edg_file, weighted=True, directed=False)
        >>> # precompute and save 2nd order transition probs (for PreComp only)
        >>> g.preprocess_transition_probs()
        >>>
        >>> # generate random walks, which could then be used to train w2v
        >>> walks = g.simulate_walks(num_walks=10, walk_length=80)
        >>>
        >>> # alternatively, generate the embeddings directly using ``embed``
        >>> emd = g.embed()

    """

    def __init__(
        self,
        p: float = 1,
        q: float = 1,
        workers: int = 1,
        verbose: bool = False,
        extend: bool = False,
        gamma: float = 0,
        random_state: Optional[int] = None,
    ):
        super().__init__()
        self.p = p
        self.q = q
        self.workers = workers  # TODO: not doing anything, need to fix.
        self.verbose = verbose
        self.extend = extend
        self.gamma = gamma
        self.random_state = random_state
        self._preprocessed: bool = False

    def _map_walk(self, walk_idx_ary: Uint32Array) -> List[str]:
        """Map walk from node index to node ID.

        Note:
            The last element in the ``walk_idx_ary`` encodes the effective walk
            length. Only walk indices up to the effective walk length are
            translated (mapped to node IDs).

        """
        end_idx = walk_idx_ary[-1]
        walk = [self.nodes[i] for i in walk_idx_ary[:end_idx]]
        return walk

    def simulate_walks(
        self,
        num_walks: int,
        walk_length: int,
    ) -> List[List[str]]:
        """Generate walks starting from each nodes ``num_walks`` time.

        Note:
            This is the master process that spawns worker processes, where the
            worker function ``_random_walks`` generates the random walks
            starting from the vertices of the graph.

        Args:
            num_walks (int): number of walks starting from each node.
            walk_length (int): length of walk.

        Returns:
            List of walks, each being a list of node IDs.

        """
        # Make sure the (possibly expensive) preprocessing ran exactly once.
        self._preprocess_transition_probs()

        nodes = np.array(range(self.num_nodes), dtype=np.uint32)
        start_node_idx_ary = np.concatenate([nodes] * num_walks)
        tot_num_jobs = start_node_idx_ary.size

        random_state = self.random_state
        np.random.seed(random_state)
        np.random.shuffle(start_node_idx_ary)  # for balanced work load

        move_forward = self.get_move_forward()
        has_nbrs = self.get_has_nbrs()
        verbose = self.verbose

        # Acquire numba progress proxy for displaying the progress bar
        with ProgressBar(total=tot_num_jobs, disable=not verbose) as progress:
            walk_idx_mat = self._random_walks(
                tot_num_jobs,
                walk_length,
                random_state,
                start_node_idx_ary,
                has_nbrs,
                move_forward,
                progress,
            )

        # Map node index back to node ID
        walks = [self._map_walk(walk_idx_ary) for walk_idx_ary in walk_idx_mat]

        return walks

    @staticmethod
    @njit(parallel=True, nogil=True)
    def _random_walks(
        tot_num_jobs: int,
        walk_length: int,
        random_state: Optional[int],
        start_node_idx_ary: Uint32Array,
        has_nbrs: HasNbrs,
        move_forward: MoveForward,
        progress_proxy: ProgressBar,
    ) -> Uint32Array:
        """Simulate random walks, one per entry of ``start_node_idx_ary``.

        Returns:
            Matrix with one row per walk. Each row holds the node indices of
            the walk, and its last entry holds the effective walk length
            (the number of leading entries that are valid).

        """
        # Seed the random number generator
        if random_state is not None:
            np.random.seed(random_state + get_thread_id())

        # use the last entry of each walk index array to keep track of the
        # effective walk length
        walk_idx_mat: Uint32Array = np.zeros(
            (tot_num_jobs, walk_length + 2),
            dtype=np.uint32,
        )
        walk_idx_mat[:, 0] = start_node_idx_ary  # initialize seeds
        walk_idx_mat[:, -1] = walk_length + 1  # set to full walk length by default

        for i in prange(tot_num_jobs):
            # initialize first step as normal random walk
            start_node_idx = walk_idx_mat[i, 0]
            if has_nbrs(start_node_idx):
                walk_idx_mat[i, 1] = move_forward(start_node_idx)
            else:
                walk_idx_mat[i, -1] = 1
                # This job is finished as well; report it so that the
                # progress bar can reach its total.
                progress_proxy.update(1)
                continue

            # start bias random walk
            for j in range(2, walk_length + 1):
                cur_idx = walk_idx_mat[i, j - 1]
                if has_nbrs(cur_idx):
                    prev_idx = walk_idx_mat[i, j - 2]
                    walk_idx_mat[i, j] = move_forward(cur_idx, prev_idx)
                else:
                    # Dead end: record the truncated length and stop.
                    walk_idx_mat[i, -1] = j
                    break

            progress_proxy.update(1)

        return walk_idx_mat

    def setup_get_normalized_probs(self):
        """Transition probability computation setup.

        This function performs necessary preprocessing of computing the
        average edge weights array, which is used later by the transition
        probability computation function ``get_extended_normalized_probs``,
        if node2vec+ is used. Otherwise, returns the normal transition function
        ``get_normalized_probs`` with a trivial placeholder for average edge
        weights array ``noise_thresholds``.

        Returns:
            The transition probability function and the noise thresholds
            (None when node2vec+ is not used).

        """
        if self.extend:  # use n2v+
            get_normalized_probs = self.get_extended_normalized_probs
            noise_thresholds = self.get_noise_thresholds()
        else:  # use normal n2v
            get_normalized_probs = self.get_normalized_probs
            noise_thresholds = None
        return get_normalized_probs, noise_thresholds

    def preprocess_transition_probs(self):
        """Null default preprocess method."""
        pass

    def _preprocess_transition_probs(self):
        """Run ``preprocess_transition_probs`` unless it was already run."""
        if not self._preprocessed:
            self.preprocess_transition_probs()
            self._preprocessed = True

    def embed(
        self,
        dim: int = 128,
        num_walks: int = 10,
        walk_length: int = 80,
        window_size: int = 10,
        epochs: int = 1,
        verbose: bool = False,
    ) -> Embeddings:
        """Generate embeddings.

        This is a shortcut function that combines ``simulate_walks`` with
        ``Word2Vec`` to generate the node2vec embedding.

        Note:
            The resulting embeddings are aligned with the graph, i.e., the
            index of embeddings is the same as that for the graph.

        Args:
            dim (int): dimension of the final embedding, default is 128
            num_walks (int): number of random walks generated using each node
                as the seed node, default is 10
            walk_length (int): length of the random walks, default is 80
            window_size (int): context window sized for training the
                ``Word2Vec`` model, default is 10
            epochs (int): number of epochs for training ``Word2Vec``, default
                is 1
            verbose (bool): print time usage for random walk generation and
                skip-gram training if set to True

        Return:
            Embeddings: The embedding matrix, each row is a node embedding
                vector. The index is the same as that for the graph.

        """
        timed_walk = Timer("generate walks", verbose)(self.simulate_walks)
        timed_w2v = Timer("train embeddings", verbose)(Word2Vec)

        walks = timed_walk(num_walks, walk_length)
        w2v = timed_w2v(
            walks,
            vector_size=dim,
            window=window_size,
            sg=1,
            min_count=0,
            workers=self.workers,
            epochs=epochs,
            seed=self.random_state,
        )

        # Reorder the embedding vectors to follow the node order of the graph.
        return w2v.wv[self.nodes]
class FirstOrderUnweighted(Base, SparseRWGraph):
    """First order walks that pick the next node uniformly among neighbors."""

    def __init__(self, *args, **kwargs):
        Base.__init__(self, *args, **kwargs)

    def get_move_forward(self):
        """Build the jitted step function.

        Returns:
            A ``numba.njit`` compiled function that takes the current node
            index (the previous node index is accepted but unused) and
            returns a neighbor index drawn uniformly from the CSR row.

        """
        indices = self.indices
        indptr = self.indptr

        @njit(nogil=True)
        def move_forward(cur_idx, prev_idx=None):
            # Row boundaries of cur_idx in the CSR index array.
            row_start = indptr[cur_idx]
            row_end = indptr[cur_idx + 1]
            pick = np.random.randint(row_start, row_end)
            return indices[pick]

        return move_forward
class PreCompFirstOrder(Base, SparseRWGraph):
    """First order walks using precomputed alias tables."""

    def __init__(self, *args, **kwargs):
        Base.__init__(self, *args, **kwargs)
        # Alias tables, filled in by ``preprocess_transition_probs``.
        self.alias_j = None
        self.alias_q = None

    def get_move_forward(self):
        """Build the jitted step function.

        Returns:
            A ``numba.njit`` compiled function that takes the current node
            index (the previous node index is accepted but unused) and
            returns the next node index, drawn with ``alias_draw`` from the
            alias tables of the current node.

        """
        indices = self.indices
        indptr = self.indptr
        alias_j = self.alias_j
        alias_q = self.alias_q

        @njit(nogil=True)
        def move_forward(cur_idx, prev_idx=None):
            start, end = indptr[cur_idx], indptr[cur_idx + 1]
            choice = alias_draw(alias_j[start:end], alias_q[start:end])
            # ``choice`` is an offset within the CSR row of cur_idx.
            return indices[indptr[cur_idx] + choice]

        return move_forward

    def preprocess_transition_probs(self):
        """Build and store the alias tables for every node.

        The tables are laid out like the CSR data: the entries of node
        ``idx`` live in ``[indptr[idx], indptr[idx + 1])``.
        """
        data, indices, indptr = self.data, self.indices, self.indptr

        # Callback computing the first order transition probabilities
        get_normalized_probs = self.get_normalized_probs_first_order

        # One alias entry per stored edge
        n_nodes = indptr.size - 1
        n_probs = indptr[-1]

        @njit(parallel=True, nogil=True)
        def compute_all_transition_probs():
            alias_j = np.zeros(n_probs, dtype=np.uint32)
            alias_q = np.zeros(n_probs, dtype=np.float32)

            for idx in range(n_nodes):
                start, end = indptr[idx], indptr[idx + 1]
                probs = get_normalized_probs(data, indices, indptr, idx)
                alias_j[start:end], alias_q[start:end] = alias_setup(probs)

            return alias_j, alias_q

        tables = compute_all_transition_probs()
        self.alias_j, self.alias_q = tables
class PreComp(Base, SparseRWGraph):
    """Precompute transition probabilities.

    This implementation precomputes and stores 2nd order transition
    probabilities first and reads them off during the random walk. The graph
    type used is ``SparseRWGraph``.

    Note:
        Need to call ``preprocess_transition_probs()`` first before generating
        walks.

    """

    def __init__(self, *args, **kwargs):
        Base.__init__(self, *args, **kwargs)
        # All four tables are populated by ``preprocess_transition_probs``.
        self.alias_dim: Optional[Uint32Array] = None
        self.alias_j: Optional[Uint32Array] = None
        self.alias_q: Optional[Float32Array] = None
        self.alias_indptr: Optional[Uint64Array] = None

    def get_move_forward(self):
        """Wrap ``move_forward``.

        This function returns a ``numba.njit`` compiled function that takes
        current vertex index (and the previous vertex index if available) and
        returns the next vertex index by sampling from a discrete random
        distribution based on the transition probabilities that are read off
        the precomputed transition probabilities table.

        Note:
            The returned function is used by the ``simulate_walks`` method.

        """
        data = self.data
        indices = self.indices
        indptr = self.indptr
        p = self.p
        q = self.q
        get_normalized_probs = self.get_normalized_probs

        alias_j = self.alias_j
        alias_q = self.alias_q
        alias_indptr = self.alias_indptr
        alias_dim = self.alias_dim

        @njit(nogil=True)
        def move_forward(cur_idx, prev_idx=None):
            """Move to next node based on transition probabilities."""
            if prev_idx is None:
                # First step of a walk: no previous node, so compute the
                # first order probabilities on the fly.
                normalized_probs = get_normalized_probs(
                    data,
                    indices,
                    indptr,
                    p,
                    q,
                    cur_idx,
                    None,
                    None,
                )
                cdf = np.cumsum(normalized_probs)
                choice = np.searchsorted(cdf, np.random.random())
            else:
                # Find index of neighbor (previous node) for reading alias
                start = indptr[cur_idx]
                end = indptr[cur_idx + 1]
                nbr_idx = np.searchsorted(indices[start:end], prev_idx)
                if indices[start + nbr_idx] != prev_idx:
                    print("FATAL ERROR! Neighbor not found.")

                # Each (current, previous) pair owns ``dim`` alias entries
                dim = alias_dim[cur_idx]
                start = alias_indptr[cur_idx] + dim * nbr_idx
                end = start + dim
                choice = alias_draw(alias_j[start:end], alias_q[start:end])

            return indices[indptr[cur_idx] + choice]

        return move_forward

    def preprocess_transition_probs(self):
        """Precompute and store 2nd order transition probabilities.

        Each node contains n ** 2 number of 2nd order transition probabilities,
        where n is the number of neighbors of that specific node, since one
        can pick any one of its neighbors as the previous node and / or the
        next node. For each second order transition probability of a node, set
        up the alias draw table to be used during random walk.

        Note:
            Uses uint64 instead of uint32 for tracking alias_indptr to prevent
            overflowing since the 2nd order transition probs grows much faster
            than the first order transition probs, which is the same as the
            total number of edges in the graph. The per-node table sizes
            (``n ** 2``) are computed in uint64 as well, since squaring in the
            dtype of ``indptr`` would wrap around for high degree nodes before
            the values ever reach ``alias_indptr``.

        """
        data = self.data
        indices = self.indices
        indptr = self.indptr
        p = self.p
        q = self.q

        # Retrieve transition probability computation callback function
        get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()

        # Determine the dimensionality of the 2nd order transition probs
        n_nodes = self.indptr.size - 1  # number of nodes
        n = self.indptr[1:] - self.indptr[:-1]  # number of nbrs per node
        # Number of 2nd order trans probs per node; widen before squaring so
        # that the result cannot overflow a 32-bit integer.
        n2 = n.astype(np.uint64) ** 2

        # Set the dimensionality of alias probability table
        self.alias_dim = alias_dim = n
        self.alias_indptr = alias_indptr = np.zeros(self.indptr.size, dtype=np.uint64)
        alias_indptr[1:] = np.cumsum(n2)
        n_probs = alias_indptr[-1]  # total number of 2nd order transition probs

        @njit(parallel=True, nogil=True)
        def compute_all_transition_probs():
            alias_j = np.zeros(n_probs, dtype=np.uint32)
            alias_q = np.zeros(n_probs, dtype=np.float32)

            for idx in range(n_nodes):
                offset = alias_indptr[idx]
                dim = alias_dim[idx]

                nbrs = indices[indptr[idx] : indptr[idx + 1]]
                # One alias table per choice of previous node ``nbr``
                for nbr_idx in prange(n[idx]):
                    nbr = nbrs[nbr_idx]
                    probs = get_normalized_probs(
                        data,
                        indices,
                        indptr,
                        p,
                        q,
                        idx,
                        nbr,
                        noise_thresholds,
                    )

                    start = offset + dim * nbr_idx
                    end = start + dim
                    alias_j[start:end], alias_q[start:end] = alias_setup(probs)

            return alias_j, alias_q

        self.alias_j, self.alias_q = compute_all_transition_probs()
class SparseOTF(Base, SparseRWGraph):
    """On-the-fly transition computation for sparse graphs.

    Nothing is precomputed by this mode: the transition probabilities of a
    step are calculated at the moment the step is taken. The graph type used
    is ``SparseRWGraph``.

    """

    def __init__(self, *args, **kwargs):
        Base.__init__(self, *args, **kwargs)

    def get_move_forward(self):
        """Build the ``numba.njit`` compiled ``move_forward`` function.

        The returned function takes the current vertex index (and the
        previous vertex index if available), computes the transition
        probabilities for that state on the fly and samples the next vertex
        index from them.

        Note:
            The returned function is used by the ``simulate_walks`` method.

        """
        edge_wts = self.data
        nbr_ids = self.indices
        nbr_ptr = self.indptr
        return_param = self.p
        inout_param = self.q

        probs_fn, thresholds = self.setup_get_normalized_probs()

        @njit(nogil=True)
        def move_forward(cur_idx, prev_idx=None):
            """Sample the next node."""
            probs = probs_fn(
                edge_wts,
                nbr_ids,
                nbr_ptr,
                return_param,
                inout_param,
                cur_idx,
                prev_idx,
                thresholds,
            )
            # Inverse-CDF sampling over the neighbors of the current node
            choice = np.searchsorted(np.cumsum(probs), np.random.random())

            return nbr_ids[nbr_ptr[cur_idx] + choice]

        return move_forward
class DenseOTF(Base, DenseRWGraph):
    """On-the-fly transition computation for dense graphs.

    Nothing is precomputed by this mode: the transition probabilities of a
    step are calculated at the moment the step is taken. The graph type used
    is ``DenseRWGraph``.

    """

    def __init__(self, *args, **kwargs):
        Base.__init__(self, *args, **kwargs)

    def get_move_forward(self):
        """Build the ``numba.njit`` compiled ``move_forward`` function.

        The returned function takes the current vertex index (and the
        previous vertex index if available), computes the transition
        probabilities for that state on the fly and samples the next vertex
        index from them.

        Note:
            The returned function is used by the ``simulate_walks`` method.

        """
        adj = self.data
        adj_mask = self.nonzero
        return_param = self.p
        inout_param = self.q

        probs_fn, thresholds = self.setup_get_normalized_probs()

        @njit(nogil=True)
        def move_forward(cur_idx, prev_idx=None):
            """Sample the next node."""
            probs = probs_fn(
                adj,
                adj_mask,
                return_param,
                inout_param,
                cur_idx,
                prev_idx,
                thresholds,
            )
            # Inverse-CDF sampling over the neighbors of the current node
            choice = np.searchsorted(np.cumsum(probs), np.random.random())
            # Map the position among the neighbors back to a node index
            candidates = np.where(adj_mask[cur_idx])[0]

            return candidates[choice]

        return move_forward
@njit(nogil=True)
def alias_setup(probs):
    """Construct alias lookup table.

    This code is modified from the blog post here:
    https://lips.cs.princeton.edu/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
    , where you can find more details about how the method works. In general,
    the alias method improves the time complexity of sampling from a discrete
    random distribution to O(1) if the alias table is set up in advance.

    Args:
        probs (Float32Array): array of normalized transition probabilities
            (``probs.size`` is read, so an array is expected).

    Returns:
        Tuple ``(j, q)`` of two arrays with the same size as ``probs``:
        ``j`` (uint32) holds the alias outcome of every slot and ``q``
        (float32) the probability of keeping the slot's own outcome. Both are
        consumed by ``alias_draw``.

    """
    k = probs.size
    q = np.zeros(k, dtype=np.float32)
    j = np.zeros(k, dtype=np.uint32)

    # ``smaller`` and ``larger`` are used as stacks of slot indices, with
    # ``smaller_ptr`` and ``larger_ptr`` tracking the number of stored items.
    smaller = np.zeros(k, dtype=np.uint32)
    larger = np.zeros(k, dtype=np.uint32)
    smaller_ptr = 0
    larger_ptr = 0

    # Scale every probability by k and split slots by whether they are
    # under-full (< 1) or not.
    for kk in range(k):
        q[kk] = k * probs[kk]
        if q[kk] < 1.0:
            smaller[smaller_ptr] = kk
            smaller_ptr += 1
        else:
            larger[larger_ptr] = kk
            larger_ptr += 1

    # Pair one under-full slot with one other slot at a time until either
    # stack runs empty.
    while (smaller_ptr > 0) & (larger_ptr > 0):
        smaller_ptr -= 1
        small = smaller[smaller_ptr]
        larger_ptr -= 1
        large = larger[larger_ptr]

        # The remainder of slot ``small`` is covered by outcome ``large``
        j[small] = large
        q[large] = q[large] + q[small] - 1.0

        # Put ``large`` back on the stack matching its remaining mass
        if q[large] < 1.0:
            smaller[smaller_ptr] = large
            smaller_ptr += 1
        else:
            larger[larger_ptr] = large
            larger_ptr += 1

    return j, q
@njit(nogil=True)
def alias_draw(j, q):
    """Sample one outcome from the alias table ``(j, q)``.

    A slot is picked uniformly at random; the slot's own index is returned
    with probability ``q[slot]`` and its alias ``j[slot]`` otherwise.
    """
    slot = np.random.randint(j.size)
    keep_own = np.random.rand() < q[slot]
    if keep_own:
        return slot
    return j[slot]
================================================
FILE: src/pecanpy/rw/__init__.py
================================================
"""Graph objects equipped with random walk transition functions."""
from .dense_rw import DenseRWGraph
from .sparse_rw import SparseRWGraph
__all__ = ["DenseRWGraph", "SparseRWGraph"]
================================================
FILE: src/pecanpy/rw/dense_rw.py
================================================
"""Dense Graph object equipped with random walk computation."""
import numpy as np
from numba import njit
from ..graph import DenseGraph
class DenseRWGraph(DenseGraph):
    """Dense Graph object equipped with random walk computation."""

    def get_noise_thresholds(self):
        """Compute the per-node noisy edge thresholds.

        The threshold of a node is the mean of its edge weights plus
        ``gamma`` times their standard deviation, clipped below at zero.
        Nodes without any neighbor keep a threshold of zero, which avoids
        taking the mean of an empty slice (NaN plus a ``RuntimeWarning``).

        Returns:
            Float32 array with one threshold per node.

        """
        noise_threshold_ary = np.zeros(self.num_nodes, dtype=np.float32)
        for i in range(self.num_nodes):
            weights = self.data[i, self.nonzero[i]]
            if weights.size > 0:  # isolated nodes keep the zero threshold
                noise_threshold_ary[i] = weights.mean() + self.gamma * weights.std()
        noise_threshold_ary = np.maximum(noise_threshold_ary, 0)

        return noise_threshold_ary

    def get_has_nbrs(self):
        """Wrap ``has_nbrs``.

        Returns a ``numba.njit`` compiled function that tells whether the
        row of ``nonzero`` for a given node index has any set entry.
        """
        nonzero = self.nonzero

        @njit(nogil=True)
        def has_nbrs(idx):
            for j in range(nonzero.shape[1]):
                if nonzero[idx, j]:
                    return True
            return False

        return has_nbrs

    @staticmethod
    @njit(nogil=True)
    def get_normalized_probs(
        data,
        nonzero,
        p,
        q,
        cur_idx,
        prev_idx,
        noise_threshold_ary,
    ):
        """Calculate node2vec transition probabilities.

        Calculate 2nd order transition probabilities by first finding the
        neighbors of the current state that are not reachable from the previous
        state, and divide the corresponding edge weights by the in-out parameter
        ``q``. Then divide the edge weight from previous state by the return
        parameter ``p``. Finally, the transition probabilities are computed by
        normalizing the biased edge weights.

        Note:
            If ``prev_idx`` present, calculate 2nd order biased transition,
            otherwise calculate 1st order transition.
            ``noise_threshold_ary`` is not used here; it is only part of the
            signature shared with ``get_extended_normalized_probs``.

        """
        nbrs_ind = nonzero[cur_idx]
        unnormalized_probs = data[cur_idx].copy()

        if prev_idx is not None:  # 2nd order biased walks
            non_com_nbr = np.logical_and(nbrs_ind, ~nonzero[prev_idx])
            non_com_nbr[prev_idx] = False  # exclude previous state from out biases

            unnormalized_probs[non_com_nbr] /= q  # apply out biases
            unnormalized_probs[prev_idx] /= p  # apply the return bias

        # Keep only the entries that correspond to neighbors of ``cur_idx``
        unnormalized_probs = unnormalized_probs[nbrs_ind]
        normalized_probs = unnormalized_probs / unnormalized_probs.sum()

        return normalized_probs

    @staticmethod
    @njit(nogil=True)
    def get_extended_normalized_probs(
        data,
        nonzero,
        p,
        q,
        cur_idx,
        prev_idx,
        noise_threshold_ary,
    ):
        """Calculate node2vec+ transition probabilities.

        Same contract as ``get_normalized_probs``, except that the out biases
        are interpolated between ``1 / q`` and ``1`` according to the ratio
        of the edge weight towards the previous state over the noise
        threshold of the candidate next state.
        """
        cur_nbrs_ind = nonzero[cur_idx]
        unnormalized_probs = data[cur_idx].copy()

        if prev_idx is not None:  # 2nd order biased walks
            prev_nbrs_weight = data[prev_idx].copy()

            # Note: we assume here the network is undirected, hence the edge
            # weight connecting the next to prev is the same as the reverse.
            out_ind = cur_nbrs_ind & (prev_nbrs_weight < noise_threshold_ary)
            out_ind[prev_idx] = False  # exclude previous state from out biases

            t = prev_nbrs_weight[out_ind] / noise_threshold_ary[out_ind]
            # optional nonlinear parameterization
            # b = 1; t = b * t / (1 - (b - 1) * t)

            # compute out biases
            alpha = 1 / q + (1 - 1 / q) * t

            # suppress noisy edges
            alpha[
                unnormalized_probs[out_ind] < noise_threshold_ary[cur_idx]
            ] = np.minimum(1, 1 / q)
            unnormalized_probs[out_ind] *= alpha  # apply out biases
            unnormalized_probs[prev_idx] /= p  # apply the return bias

        # Keep only the entries that correspond to neighbors of ``cur_idx``
        unnormalized_probs = unnormalized_probs[cur_nbrs_ind]
        normalized_probs = unnormalized_probs / unnormalized_probs.sum()

        return normalized_probs
================================================
FILE: src/pecanpy/rw/sparse_rw.py
================================================
"""Sparse Graph equipped with random walk computation."""
import numpy as np
from numba import boolean
from numba import njit
from ..graph import SparseGraph
class SparseRWGraph(SparseGraph):
    """Sparse Graph equipped with random walk computation."""

    def get_has_nbrs(self):
        """Wrap ``has_nbrs``.

        Returns a ``numba.njit`` compiled function that tells whether the CSR
        slice of a given node index is non-empty.
        """
        indptr = self.indptr

        @njit(nogil=True)
        def has_nbrs(idx):
            return indptr[idx] != indptr[idx + 1]

        return has_nbrs

    def get_noise_thresholds(self):
        """Compute the per-node noisy edge thresholds.

        The threshold of a node is the mean of its edge weights plus
        ``gamma`` times their standard deviation, clipped below at zero.
        Nodes without any neighbor keep a threshold of zero, which avoids
        taking the mean of an empty slice (NaN plus a ``RuntimeWarning``);
        a NaN threshold would otherwise turn into NaN transition
        probabilities once it is used by ``isnotin_extended``.

        Returns:
            Float32 array with one threshold per node.

        """
        data = self.data
        indptr = self.indptr

        noise_threshold_ary = np.zeros(self.num_nodes, dtype=np.float32)
        for i in range(self.num_nodes):
            weights = data[indptr[i] : indptr[i + 1]]
            if weights.size > 0:  # isolated nodes keep the zero threshold
                noise_threshold_ary[i] = weights.mean() + self.gamma * weights.std()
        noise_threshold_ary = np.maximum(noise_threshold_ary, 0)

        return noise_threshold_ary

    @staticmethod
    @njit(nogil=True)
    def get_normalized_probs_first_order(data, indices, indptr, cur_idx):
        """Calculate first order transition probabilities.

        Note:
            This function does NOT check whether p = q = 1, which is the
            required setup for first order random walk. Need to check before
            calling this function.

        """
        _, unnormalized_probs = get_nbrs(indptr, indices, data, cur_idx)
        return unnormalized_probs / unnormalized_probs.sum()

    @staticmethod
    @njit(nogil=True)
    def get_normalized_probs(
        data,
        indices,
        indptr,
        p,
        q,
        cur_idx,
        prev_idx,
        noise_threshold_ary,
    ):
        """Calculate node2vec transition probabilities.

        Calculate 2nd order transition probabilities by first finding the
        neighbors of the current state that are not reachable from the previous
        state, and divide the corresponding edge weights by the in-out parameter
        ``q``. Then divide the edge weight from previous state by the return
        parameter ``p``. Finally, the transition probabilities are computed by
        normalizing the biased edge weights.

        Note:
            If ``prev_idx`` present, calculate 2nd order biased transition,
            otherwise calculate 1st order transition.
            ``noise_threshold_ary`` is not used here; it is only part of the
            signature shared with ``get_extended_normalized_probs``.

        """
        nbrs_idx, unnormalized_probs = get_nbrs(indptr, indices, data, cur_idx)
        if prev_idx is not None:  # 2nd order biased walk
            # Position(s) of the previous state among the current neighbors
            prev_ptr = np.where(nbrs_idx == prev_idx)[0]
            src_nbrs_idx, src_nbrs_wts = get_nbrs(indptr, indices, data, prev_idx)

            # Neighbors of current but not previous
            non_com_nbr = isnotin(nbrs_idx, src_nbrs_idx)
            non_com_nbr[prev_ptr] = False  # exclude prev state from out biases

            unnormalized_probs[non_com_nbr] /= q  # apply out biases
            unnormalized_probs[prev_ptr] /= p  # apply the return bias

        normalized_probs = unnormalized_probs / unnormalized_probs.sum()

        return normalized_probs

    @staticmethod
    @njit(nogil=True)
    def get_extended_normalized_probs(
        data,
        indices,
        indptr,
        p,
        q,
        cur_idx,
        prev_idx,
        noise_threshold_ary,
    ):
        """Calculate node2vec+ transition probabilities.

        Same contract as ``get_normalized_probs``, except that the out edges
        and their interpolation parameters come from ``isnotin_extended``,
        and the out biases are interpolated between ``1 / q`` and ``1``.
        """
        nbrs_idx, unnormalized_probs = get_nbrs(indptr, indices, data, cur_idx)
        if prev_idx is not None:  # 2nd order biased walk
            # Position(s) of the previous state among the current neighbors
            prev_ptr = np.where(nbrs_idx == prev_idx)[0]
            src_nbrs_idx, src_nbrs_wts = get_nbrs(indptr, indices, data, prev_idx)
            out_ind, t = isnotin_extended(
                nbrs_idx,
                src_nbrs_idx,
                src_nbrs_wts,
                noise_threshold_ary,
            )  # determine out edges
            out_ind[prev_ptr] = False  # exclude prevstate from out biases

            # compute out biases
            alpha = 1 / q + (1 - 1 / q) * t[out_ind]

            # suppress noisy edges
            alpha[
                unnormalized_probs[out_ind] < noise_threshold_ary[cur_idx]
            ] = np.minimum(1, 1 / q)
            unnormalized_probs[out_ind] *= alpha  # apply out biases
            unnormalized_probs[prev_ptr] /= p  # apply the return bias

        normalized_probs = unnormalized_probs / unnormalized_probs.sum()

        return normalized_probs
@njit(nogil=True)
def get_nbrs(indptr, indices, data, idx):
    """Look up the neighbors of node ``idx`` in the CSR arrays.

    Returns the slice of ``indices`` holding the neighbor indices together
    with a copy of the matching slice of ``data`` (the edge weights), so the
    caller may modify the weights without touching the graph.
    """
    lo = indptr[idx]
    hi = indptr[idx + 1]
    return indices[lo:hi], data[lo:hi].copy()
@njit(nogil=True)
def isnotin(ptr_ary1, ptr_ary2):
    """Find node2vec out edges.

    The node2vec out edges are determined by non-common neighbors. This function
    finds out neighbors of node1 that are not neighbors of node2, by picking out
    values in ``ptr_ary1`` but not in ``ptr_ary2``, which correspond to the
    neighbor pointers for the current state and the previous state, resp.

    Note:
        This function does not remove the index of the previous state. Instead,
        the index of the previous state will be removed once the indicator is
        returned to the ``get_normalized_probs``.

    Args:
        ptr_ary1 (Uint32Array): array of pointers to
            the neighbors of the current state
        ptr_ary2 (Uint32Array): array of pointers to
            the neighbors of the previous state

    Returns:
        Boolean array with one entry per element of ``ptr_ary1``, indicating
        whether that neighbor of the current state is considered as an
        "out edge"

    Example:
        The values in the two neighbor pointer arrays are sorted ascendingly.
        The main idea is to scan through ``ptr_ary1`` and compare the values in
        ``ptr_ary2``. In this way, at most one pass per array is needed to find
        out the non-common neighbor pointers instead of a nested loop (for each
        element in ``ptr_ary1``, compare against every element in ``ptr_ary2``),
        which is much slower. Check out the following example for more
        intuition. The ``*`` above ``ptr_ary1`` and ``ptr_ary2`` indicate the
        indices ``idx1`` and ``idx2``, respectively, which keep track of the
        scanning progress.

        >>> ptr_ary1 = [1, 2, 5]
        >>> ptr_ary2 = [1, 5]
        >>>
        >>> # iteration1: indicator = [False, True, True]
        >>>  *
        >>> [1, 2, 5]
        >>>  *
        >>> [1, 5]
        >>>
        >>> # iteration2: indicator = [False, True, True]
        >>>     *
        >>> [1, 2, 5]
        >>>     *
        >>> [1, 5]
        >>>
        >>> # iteration3: indicator = [False, True, False]
        >>>        *
        >>> [1, 2, 5]
        >>>     *
        >>> [1, 5]
        >>>
        >>> # end of loop

    """
    # Every neighbor starts as an out edge; matches are flipped to False
    indicator = np.ones(ptr_ary1.size, dtype=boolean)
    idx2 = 0
    for idx1 in range(ptr_ary1.size):
        if idx2 == ptr_ary2.size:  # end of ary2
            break

        ptr1 = ptr_ary1[idx1]
        ptr2 = ptr_ary2[idx2]

        if ptr1 < ptr2:
            # ptr1 cannot appear later in the sorted ptr_ary2
            continue

        elif ptr1 == ptr2:  # found a matching value
            indicator[idx1] = False
            idx2 += 1

        elif ptr1 > ptr2:
            # sweep through ptr_ary2 until ptr2 catch up on ptr1
            for j in range(idx2, ptr_ary2.size):
                ptr2 = ptr_ary2[j]
                if ptr2 == ptr1:
                    indicator[idx1] = False
                    idx2 = j + 1
                    break

                elif ptr2 > ptr1:
                    idx2 = j
                    break

    return indicator
@njit(nogil=True)
def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, noise_thresholds):
    """Find node2vec+ out edges.

    The node2vec+ out edges are determined by considering the edge weights
    connecting node2 (the potential next state) to the previous state. Unlike
    node2vec, which only considers neighbors of current state that are not
    neighbors of the previous state, node2vec+ also considers neighbors of
    the previous state as out edges if the edge weight is below the noise
    threshold of that neighbor.

    Args:
        ptr_ary1 (Uint32Array): array of pointers to the neighbors of the
            current state
        ptr_ary2 (Uint32Array): array of pointers to the neighbors of the
            previous state
        wts_ary2 (Float32Array): array of edge weights of the previous state
        noise_thresholds (Float32Array): array of noisy edge threshold computed
            based on the average and the std of the edge weights of each node

    Return:
        Tuple ``(indicator, t)``: ``indicator`` is a boolean array telling
        whether a neighbor of the current state is considered as an
        "out edge", and ``t`` is a float32 array with the corresponding
        parameters used to fine tune the out biases (zero unless the neighbor
        is loosely connected to the previous state)

    """
    # Every neighbor starts as an out edge with t = 0
    indicator = np.ones(ptr_ary1.size, dtype=boolean)
    t = np.zeros(ptr_ary1.size, dtype=np.float32)
    idx2 = 0
    for idx1 in range(ptr_ary1.size):
        if idx2 >= ptr_ary2.size:  # end of ary2
            break

        ptr1 = ptr_ary1[idx1]
        ptr2 = ptr_ary2[idx2]

        if ptr1 < ptr2:
            # ptr1 cannot appear later in the sorted ptr_ary2
            continue

        elif ptr1 == ptr2:  # found a matching value
            # If connection is not loose, identify as an in-edge
            if wts_ary2[idx2] >= noise_thresholds[ptr2]:
                indicator[idx1] = False
            else:
                # Loose connection: keep as out edge, record weight ratio
                t[idx1] = wts_ary2[idx2] / noise_thresholds[ptr2]
            idx2 += 1

        elif ptr1 > ptr2:
            # Sweep through ptr_ary2 until ptr2 catch up on ptr1
            for j in range(idx2 + 1, ptr_ary2.size):
                ptr2 = ptr_ary2[j]
                if ptr2 == ptr1:
                    if wts_ary2[j] >= noise_thresholds[ptr2]:
                        indicator[idx1] = False
                    else:
                        t[idx1] = wts_ary2[j] / noise_thresholds[ptr2]
                    idx2 = j + 1
                    break

                elif ptr2 > ptr1:
                    idx2 = j
                    break

    return indicator, t
================================================
FILE: src/pecanpy/typing.py
================================================
"""Type annotations."""
from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterator
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from nptyping import Bool
from nptyping import Float32
from nptyping import NDArray
from nptyping import Shape
from nptyping import UInt32
from nptyping import UInt64
from typing_extensions import TypeAlias
# Callbacks ###################################################################
# Callable taking a single UInt32 argument and returning a bool.
HasNbrs = Callable[[UInt32], bool]
# Callable with an unconstrained argument list returning a UInt32.
MoveForward = Callable[..., UInt32]

# Numpy array types ###########################################################
# issue with type alias (https://stackoverflow.com/questions/62073473)
# Two dimensional arrays
Embeddings: TypeAlias = NDArray[Shape["*, *"], Float32]
AdjMat: TypeAlias = NDArray[Shape["*, *"], Any]
AdjNonZeroMat: TypeAlias = NDArray[Shape["*, *"], Bool]
# One dimensional arrays
Uint32Array: TypeAlias = NDArray[Shape["*"], UInt32]
Uint64Array: TypeAlias = NDArray[Shape["*"], UInt64]
Float32Array: TypeAlias = NDArray[Shape["*"], Float32]
# Triplet of two uint32 arrays and one float32 array
# NOTE(review): presumably ordered as (indptr, indices, data) - confirm
CSR = Tuple[Uint32Array, Uint32Array, Float32Array]
__all__ = [
"AdjMat",
"AdjNonZeroMat",
"Any",
"CSR",
"Callable",
"Dict",
"Embeddings",
"Float32Array",
"HasNbrs",
"Iterator",
"List",
"MoveForward",
"NDArray",
"Optional",
"Sequence",
"Tuple",
"Uint32Array",
]
================================================
FILE: src/pecanpy/wrappers.py
================================================
"""Wrappers used by pecanpy."""
import time
class Timer:
    """Decorator factory that reports how long a function call took.

    Args:
        name (str): description of the timed task, used in the printed
            message.
        verbose (bool): when False, decorating is a no-op and the function
            is handed back untouched.

    """

    def __init__(self, name, verbose=True):
        self.verbose = verbose
        self.name = name

    def __call__(self, func):
        """Decorate ``func`` with timing, or return it as is if not verbose."""
        if not self.verbose:
            return func

        label = self

        def wrapper(*args, **kwargs):
            begin = time.time()
            result = func(*args, **kwargs)
            elapsed = time.time() - begin

            # Break the elapsed seconds into an HH:MM:SS.ss display
            whole_hours, remainder = divmod(elapsed, 3600)
            hrs = int(whole_hours)
            mins = int(remainder // 60)
            secs = elapsed % 60
            print(f"Took {hrs:02d}:{mins:02d}:{secs:05.2f} to {label.name}")

            return result

        return wrapper
================================================
FILE: test/test_cli.py
================================================
import os
import os.path as osp
import shutil
import subprocess
import tempfile
import unittest
from unittest.mock import patch
from numba import set_num_threads
from parameterized import parameterized
from pecanpy import cli
# Run numba with a single thread for this test module.
set_num_threads(1)

# ``demo`` directory located two levels above this file, and the karate
# edge list stored in it.
DATA_DIR = osp.abspath(osp.join(__file__, osp.pardir, osp.pardir, "demo"))
EDG_FP = osp.join(DATA_DIR, "karate.edg")

# Temporary directory holding the converted graphs; it is removed in
# ``TestCli.tearDownClass``.
TMP_DATA_DIR = tempfile.mkdtemp()
CSR_FP = osp.join(TMP_DATA_DIR, "karate.csr.npz")
DENSE_FP = osp.join(TMP_DATA_DIR, "karate.dense.npz")

# Command prefix; the output path and further options are appended by callers.
COM = ["pecanpy", "--input", EDG_FP, "--output"]

# Mode names used to parameterize the tests (one single-element tuple each).
SETTINGS = [
    ("FirstOrderUnweighted",),
    ("PreCompFirstOrder",),
    ("PreComp",),
    ("SparseOTF",),
    ("DenseOTF",),
]
class TestCli(unittest.TestCase):
    """Run the command line workflow end to end for every mode."""

    @classmethod
    def setUpClass(cls):
        # Convert the edge list once into the CSR and dense ``.npz`` inputs.
        # ``check=True`` makes a failed conversion abort the test class right
        # away instead of surfacing later as an unrelated file reading error.
        subprocess.run(COM + [CSR_FP, "--task", "tocsr"], check=True)
        subprocess.run(COM + [DENSE_FP, "--task", "todense"], check=True)

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(TMP_DATA_DIR)

    @patch(
        "argparse._sys.argv",
        ["pecanpy", "--input", "", "--output", os.devnull],
    )
    def setUp(self):
        # Parse a minimal command line, then shrink the workload so that
        # each test stays fast.
        self.args = cli.parse_args()
        self.args.workers = 1
        self.args.dimensions = 8
        self.args.walk_length = 10
        self.args.num_walks = 2
        self.g = self.walks = None

    def tearDown(self):
        del self.args
        del self.g
        del self.walks

    def execute(self, mode, input_file, p=1, q=1):
        """Run read -> preprocess -> walk -> embed with the given settings."""
        self.args.mode = mode
        self.args.input = input_file
        self.args.p = p
        self.args.q = q
        self.g = cli.read_graph(self.args)
        cli.preprocess(self.g)
        self.walks = cli.simulate_walks(self.args, self.g)
        cli.learn_embeddings(self.args, self.walks)

    def test_firstorderunweighted_catch(self):
        # First order modes must reject any setting other than p = q = 1
        for p, q in (2, 1), (1, 0.1), (0.1, 0.1):
            with self.subTest(p=p, q=q):
                with self.assertRaises(ValueError):
                    self.execute("FirstOrderUnweighted", EDG_FP, p, q)

    def test_precompfirstorder_catch(self):
        # First order modes must reject any setting other than p = q = 1
        for p, q in (2, 1), (1, 0.1), (0.1, 0.1):
            with self.subTest(p=p, q=q):
                with self.assertRaises(ValueError):
                    self.execute("PreCompFirstOrder", EDG_FP, p, q)

    @parameterized.expand(SETTINGS)
    def test_from_edg(self, name):
        self.execute(name, EDG_FP)

    @parameterized.expand(SETTINGS)
    def test_from_npz(self, name):
        # The dense mode reads the dense file, every other mode the CSR file
        self.execute(name, DENSE_FP if name == "DenseOTF" else CSR_FP)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
================================================
FILE: test/test_graph.py
================================================
import os
import os.path as osp
import shutil
import tempfile
import unittest
from itertools import chain
import numpy as np
import pytest
import scipy.sparse
from pecanpy.graph import AdjlstGraph
from pecanpy.graph import BaseGraph
from pecanpy.graph import DenseGraph
from pecanpy.graph import SparseGraph
# Graph 1: node 0 is connected to both node 1 and node 2.
# The same graph is given as a dense matrix (MAT), as CSR arrays
# (INDPTR / INDICES / DATA) and as an adjacency list (ADJLST).
MAT = np.array(
    [
        [0, 1, 1],
        [1, 0, 0],
        [1, 0, 0],
    ],
    dtype=float,
)
INDPTR = np.array([0, 2, 3, 4], dtype=np.uint32)
INDICES = np.array([1, 2, 0, 0], dtype=np.uint32)
DATA = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32)
ADJLST = [
    {1: 1.0, 2: 1.0},
    {0: 1.0},
    {0: 1.0},
]
IDS = ["a", "b", "c"]
IDMAP = {"a": 0, "b": 1, "c": 2}

# Graph 2: this case ensures that the node IDs (from edges) are loaded in the
# correct order even if they appear to have been loaded in an incorrect order.
MAT2 = np.array(
    [
        [0, 1, 0, 0, 0],
        [1, 0, 1, 1, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 1],
        [0, 0, 0, 1, 0],
    ],
    dtype=float,
)
INDPTR2 = np.array([0, 1, 4, 5, 7, 8], dtype=np.uint32)
INDICES2 = np.array([1, 0, 2, 3, 1, 1, 4, 3], dtype=np.uint32)
DATA2 = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
ADJLST2 = [
    {1: 1.0},
    {0: 1.0, 2: 1.0, 3: 1.0},
    {1: 1.0},
    {1: 1.0, 4: 1.0},
    {3: 1.0},
]
IDS2 = ["a", "b", "c", "d", "e"]
IDMAP2 = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}

# Graph 3: asymmetric directed graph in which one node (index 2) has no
# outgoing edge.
MAT3 = np.array(
    [
        [0, 1, 0, 0],
        [1, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 1, 1, 0],
    ],
)
INDPTR3 = np.array([0, 1, 3, 3, 5], dtype=np.uint32)
INDICES3 = np.array([1, 0, 3, 1, 2], dtype=np.uint32)
DATA3 = np.array([1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)
ADJLST3 = [
    {1: 1.0},
    {0: 1.0, 3: 1.0},
    {},
    {1: 1.0, 2: 1.0},
]
IDS3 = ["a", "b", "c", "d"]
IDMAP3 = {"a": 0, "b": 1, "c": 2, "d": 3}
class TestBaseGraph(unittest.TestCase):
    """Checks for the node ID bookkeeping of ``BaseGraph``."""

    def setUp(self):
        graph = BaseGraph()
        graph.set_node_ids(IDS)
        self.g = graph

    def test_set_node_ids(self):
        graph = self.g
        self.assertEqual(graph.nodes, IDS)
        self.assertEqual(graph._node_idmap, IDMAP)

    def test_properties(self):
        self.assertEqual(self.g.num_nodes, 3)

        # Edge based properties are expected to be unavailable on the base
        # class, so reading them must raise.
        for prop, value in (("num_edges", 4), ("density", 2 / 3)):
            with self.assertRaises(NotImplementedError):
                self.assertEqual(getattr(self.g, prop), value)
class TestAdjlstGraph(unittest.TestCase):
    """Checks for ``AdjlstGraph`` built from the three fixture matrices."""

    def setUp(self):
        self.g1 = AdjlstGraph.from_mat(MAT, IDS)
        self.g2 = AdjlstGraph.from_mat(MAT2, IDS2)
        self.g3 = AdjlstGraph.from_mat(MAT3, IDS3)

    def tearDown(self):
        del self.g1
        del self.g2
        del self.g3

    def test_from_mat(self):
        self.assertEqual(self.g1._data, ADJLST)
        self.assertEqual(self.g1.nodes, IDS)

        self.assertEqual(self.g2._data, ADJLST2)
        self.assertEqual(self.g2.nodes, IDS2)

        self.assertEqual(self.g3._data, ADJLST3)
        self.assertEqual(self.g3.nodes, IDS3)

    def test_properties(self):
        self.assertEqual(self.g1.num_nodes, 3)
        self.assertEqual(self.g1.num_edges, 4)
        self.assertEqual(self.g1.density, 2 / 3)

        self.assertEqual(self.g2.num_nodes, 5)
        self.assertEqual(self.g2.num_edges, 8)
        self.assertEqual(self.g2.density, 2 / 5)

        self.assertEqual(self.g3.num_nodes, 4)
        self.assertEqual(self.g3.num_edges, 5)
        self.assertEqual(self.g3.density, 5 / 12)

    def test_edges(self):
        self.assertEqual(
            list(self.g1.edges),
            [
                (0, 1, 1),
                (0, 2, 1),
                (1, 0, 1),
                (2, 0, 1),
            ],
        )

        self.assertEqual(
            list(self.g2.edges),
            [
                (0, 1, 1),
                (1, 0, 1),
                (1, 2, 1),
                (1, 3, 1),
                (2, 1, 1),
                (3, 1, 1),
                (3, 4, 1),
                (4, 3, 1),
            ],
        )

    def test_save(self):
        # Expected file content keyed by (unweighted, delimiter)
        expected_results = {
            (False, "\t"): [
                "a\tb\t1.0\n",
                "a\tc\t1.0\n",
                "b\ta\t1.0\n",
                "c\ta\t1.0\n",
            ],
            (True, "\t"): [
                "a\tb\n",
                "a\tc\n",
                "b\ta\n",
                "c\ta\n",
            ],
            (False, ","): [
                "a,b,1.0\n",
                "a,c,1.0\n",
                "b,a,1.0\n",
                "c,a,1.0\n",
            ],
            (True, ","): [
                "a,b\n",
                "a,c\n",
                "b,a\n",
                "c,a\n",
            ],
        }

        # The context manager removes the directory even when an assertion
        # below fails, so a failing run does not leave files behind.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmppath = os.path.join(tmpdir, "test.edg")

            for unweighted in True, False:
                for delimiter in ["\t", ","]:
                    self.g1.save(
                        tmppath,
                        unweighted=unweighted,
                        delimiter=delimiter,
                    )

                    with open(tmppath) as f:
                        expected_result = expected_results[(unweighted, delimiter)]
                        for line, expected_line in zip(f, expected_result):
                            self.assertEqual(line, expected_line)
class TestSparseGraph(unittest.TestCase):
    """Checks that ``SparseGraph`` builds the expected CSR arrays."""

    def tearDown(self):
        del self.g1
        del self.g2
        del self.g3

    def validate(self):
        """Compare ``g1``, ``g2`` and ``g3`` against the fixture constants."""
        # (graph, indptr, indices, data, ids, num_nodes, num_edges, density)
        expectations = (
            (self.g1, INDPTR, INDICES, DATA, IDS, 3, 4, 2 / 3),
            (self.g2, INDPTR2, INDICES2, DATA2, IDS2, 5, 8, 2 / 5),
            (self.g3, INDPTR3, INDICES3, DATA3, IDS3, 4, 5, 5 / 12),
        )
        for graph, indptr, indices, data, ids, n_nodes, n_edges, dens in expectations:
            self.assertTrue(np.all(graph.indptr == indptr))
            self.assertTrue(np.all(graph.indices == indices))
            self.assertTrue(np.all(graph.data == data))
            self.assertEqual(graph.nodes, ids)
            self.assertEqual(graph.num_nodes, n_nodes)
            self.assertEqual(graph.num_edges, n_edges)
            self.assertEqual(graph.density, dens)

    def test_from_mat(self):
        self.g1, self.g2, self.g3 = (
            SparseGraph.from_mat(MAT, IDS),
            SparseGraph.from_mat(MAT2, IDS2),
            SparseGraph.from_mat(MAT3, IDS3),
        )
        self.validate()

    def test_from_adjlst_graph(self):
        adjlst1 = AdjlstGraph.from_mat(MAT, IDS)
        self.g1 = SparseGraph.from_adjlst_graph(adjlst1)
        adjlst2 = AdjlstGraph.from_mat(MAT2, IDS2)
        self.g2 = SparseGraph.from_adjlst_graph(adjlst2)
        adjlst3 = AdjlstGraph.from_mat(MAT3, IDS3)
        self.g3 = SparseGraph.from_adjlst_graph(adjlst3)
        self.validate()
class TestDenseGraph(unittest.TestCase):
    """Checks that ``DenseGraph`` builds the expected adjacency matrices."""

    def tearDown(self):
        # Every test creates g1, g2 and g3, so release all three.
        del self.g1
        del self.g2
        del self.g3

    def validate(self):
        """Compare ``g1``, ``g2`` and ``g3`` against the fixture constants."""
        self.assertTrue(np.all(self.g1.data == MAT))
        self.assertEqual(self.g1.nodes, IDS)
        self.assertEqual(self.g1.num_nodes, 3)
        self.assertEqual(self.g1.num_edges, 4)
        self.assertEqual(self.g1.density, 2 / 3)

        self.assertTrue(np.all(self.g2.data == MAT2))
        self.assertEqual(self.g2.nodes, IDS2)
        self.assertEqual(self.g2.num_nodes, 5)
        self.assertEqual(self.g2.num_edges, 8)
        self.assertEqual(self.g2.density, 2 / 5)

        self.assertTrue(np.all(self.g3.data == MAT3))
        self.assertEqual(self.g3.nodes, IDS3)
        self.assertEqual(self.g3.num_nodes, 4)
        self.assertEqual(self.g3.num_edges, 5)
        self.assertEqual(self.g3.density, 5 / 12)

    def test_from_mat(self):
        self.g1 = DenseGraph.from_mat(MAT, IDS)
        self.g2 = DenseGraph.from_mat(MAT2, IDS2)
        self.g3 = DenseGraph.from_mat(MAT3, IDS3)
        self.validate()

    def test_from_adjlst_graph(self):
        self.g1 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT, IDS))
        self.g2 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT2, IDS2))
        self.g3 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT3, IDS3))
        self.validate()
@pytest.mark.usefixtures("karate_graph_converted")
def test_csr_from_scipy(tmpdir):
    """Check that a CSR file written by scipy loads like the karate edge list."""
    npz_path = osp.join(tmpdir, "karate.csr.npz")
    print(f"Temporary karate CSR will be saved under {npz_path}")

    # Build an undirected, unweighted scipy CSR matrix from the raw edge list.
    # One is subtracted so the node ids read from the file become 0-based indices.
    one_way = np.loadtxt(pytest.KARATE_ORIG_PATH).astype(int) - 1
    both_ways = np.vstack((one_way, one_way[:, [1, 0]])).T  # add reversed edges
    rows, cols = both_ways[0], both_ways[1]
    size = both_ways.max() + 1
    scipy_mat = scipy.sparse.csr_matrix(
        (np.ones(both_ways.shape[1]), ([rows, cols])),
        shape=(size, size),
    )
    scipy.sparse.save_npz(npz_path, scipy_mat)

    # Load the scipy-written file and, separately, the original edge list.
    from_scipy = SparseGraph()
    native = AdjlstGraph()
    from_scipy.read_npz(npz_path, weighted=False)
    native.read(pytest.KARATE_ORIG_PATH, weighted=False, directed=False)

    # Both graphs must have the same number of nodes.
    assert from_scipy.num_nodes == native.num_nodes

    # Every node must have the same number of neighbors in both graphs.
    nbhd_sizes = np.diff(from_scipy.indptr)
    for scipy_idx in range(from_scipy.num_nodes):
        native_idx = native.get_node_idx(str(scipy_idx + 1))
        assert nbhd_sizes[scipy_idx] == len(
            native._data[native_idx],
        )
@pytest.mark.usefixtures("karate_graph_converted")
@pytest.mark.parametrize("implicit_ids", [True, False])
@pytest.mark.parametrize("graph_factory", [SparseGraph, DenseGraph])
def test_implicit_ids(implicit_ids, graph_factory):
    """Check the node ids read from npz with and without ``implicit_ids``."""
    # Pick the npz file matching the graph type under test.
    if graph_factory == SparseGraph:
        npz_path = pytest.KARATE_CSR_PATH
    else:
        npz_path = pytest.KARATE_DENSE_PATH

    # Pick the ids the loaded graph is expected to report.
    if implicit_ids:
        expected_ids = pytest.KARATE_IMPLICIT_IDS
    else:
        expected_ids = pytest.KARATE_NODE_IDS

    loaded = graph_factory()
    loaded.read_npz(npz_path, weighted=False, implicit_ids=implicit_ids)
    assert sorted(loaded.nodes) == sorted(expected_ids)
@pytest.fixture(scope="module")
def karate_graph_converted(pytestconfig, tmpdir_factory):
    """Write karate CSR/dense npz files and expose paths and ids on ``pytest``."""
    out_dir = tmpdir_factory.mktemp("test_graph")
    edg_path = osp.join(pytestconfig.rootpath, "demo/karate.edg")
    pytest.KARATE_ORIG_PATH = edg_path
    pytest.KARATE_CSR_PATH = osp.join(out_dir, "karate.csr.npz")
    pytest.KARATE_DENSE_PATH = osp.join(out_dir, "karate.dense.npz")

    # Collect the distinct node ids (as strings) appearing in the edge list.
    edge_rows = np.loadtxt(edg_path, dtype=str).tolist()
    unique_ids = list(set(chain.from_iterable(edge_rows)))
    pytest.KARATE_NODE_IDS = unique_ids
    pytest.KARATE_IMPLICIT_IDS = [str(i) for i in range(len(unique_ids))]

    # Read the graph once, then save it in both npz formats.
    adjlst = AdjlstGraph()
    adjlst.read(edg_path, weighted=False, directed=False)
    SparseGraph.from_adjlst_graph(adjlst).save(pytest.KARATE_CSR_PATH)
    DenseGraph.from_adjlst_graph(adjlst).save(pytest.KARATE_DENSE_PATH)
    del adjlst

    yield
# Running this file as a script starts the unittest runner. The test functions
# above that use pytest marks and fixtures are meant to be run through pytest.
if __name__ == "__main__":
unittest.main()
================================================
FILE: test/test_pecanpy.py
================================================
import os.path as osp
import unittest
from numba import set_num_threads
from parameterized import parameterized
from pecanpy import graph
from pecanpy import pecanpy
# Limit numba to a single thread for these tests.
set_num_threads(1)

# The demo/ directory sits next to the directory holding this file.
DATA_DIR = osp.join(osp.dirname(osp.dirname(osp.abspath(__file__))), "demo")
EDG_FP = osp.join(DATA_DIR, "karate.edg")

# (test-case name, pecanpy mode class) pairs fed to ``parameterized.expand``.
SETTINGS = [
    (mode_name, getattr(pecanpy, mode_name))
    for mode_name in (
        "SparseOTF",
        "DenseOTF",
        "PreComp",
        "PreCompFirstOrder",
        "FirstOrderUnweighted",
    )
]
class TestPecanPy(unittest.TestCase):
    """Smoke-test every mode in SETTINGS on the karate graph.

    The tests only check that graph construction and ``embed`` run without
    raising; the returned embeddings are not inspected.
    """

    @classmethod
    def setUpClass(cls):
        """Load the karate graph once and cache its dense matrix and node ids."""
        g = graph.DenseGraph()
        g.read_edg(EDG_FP, weighted=False, directed=False)
        cls.mat = g.data
        cls.ids = g.nodes

    @parameterized.expand(SETTINGS)
    def test_from_mat(self, name, mode):
        """Build the mode from the cached matrix and run ``embed``."""
        with self.subTest(name):
            g = mode.from_mat(self.mat, self.ids, p=1, q=1)
            g.embed()

    @parameterized.expand(SETTINGS)
    def test_from_edg(self, name, mode):
        """Build the mode by reading the edge list file and run ``embed``."""
        with self.subTest(name):
            g = mode(p=1, q=1)
            g.read_edg(EDG_FP, weighted=False, directed=False)
            g.embed()
# Allow running this module directly with the unittest runner.
if __name__ == "__main__":
unittest.main()
================================================
FILE: test/test_walk.py
================================================
import unittest
import numpy as np
from numba import set_num_threads
from parameterized import parameterized
from pecanpy import pecanpy
# Limit numba to a single thread for these tests.
set_num_threads(1)
# Symmetric 0/1 adjacency matrix of the 5-node graph used by TestWalk.
MAT = np.array(
[
[0, 1, 0, 0, 0],
[1, 0, 1, 0, 0],
[0, 1, 0, 1, 1],
[0, 0, 1, 0, 1],
[0, 0, 1, 1, 0],
],
)
# Node ids passed to ``from_mat`` together with MAT (five ids for five rows).
IDS = ["a", "b", "c", "d", "e"]
# Expected result of ``simulate_walks(2, 3)`` with ``random_state=0``, keyed by
# the case name used in TestWalk. Each entry holds 10 walks of 4 node ids.
# The "PreCompFirstOrder" and "PreComp" lists are identical, and so are the
# "SparseOTF" and "DenseOTF" lists.
WALKS = {
"FirstOrderUnweighted": [
["c", "b", "c", "d"],
["d", "c", "d", "e"],
["e", "d", "c", "b"],
["e", "d", "c", "b"],
["b", "a", "b", "a"],
["b", "a", "b", "c"],
["c", "e", "d", "e"],
["d", "c", "b", "c"],
["a", "b", "c", "d"],
["a", "b", "c", "b"],
],
"PreCompFirstOrder": [
["c", "d", "e", "d"],
["d", "c", "d", "e"],
["e", "d", "c", "e"],
["e", "d", "e", "c"],
["b", "c", "e", "c"],
["b", "c", "d", "c"],
["c", "d", "e", "d"],
["d", "c", "e", "d"],
["a", "b", "a", "b"],
["a", "b", "c", "e"],
],
"PreComp": [
["c", "d", "e", "d"],
["d", "c", "d", "e"],
["e", "d", "c", "e"],
["e", "d", "e", "c"],
["b", "c", "e", "c"],
["b", "c", "d", "c"],
["c", "d", "e", "d"],
["d", "c", "e", "d"],
["a", "b", "a", "b"],
["a", "b", "c", "e"],
],
"SparseOTF": [
["c", "d", "e", "d"],
["d", "e", "c", "d"],
["e", "c", "e", "d"],
["e", "c", "e", "d"],
["b", "c", "e", "c"],
["b", "a", "b", "c"],
["c", "e", "d", "e"],
["d", "e", "c", "e"],
["a", "b", "c", "b"],
["a", "b", "c", "d"],
],
"DenseOTF": [
["c", "d", "e", "d"],
["d", "e", "c", "d"],
["e", "c", "e", "d"],
["e", "c", "e", "d"],
["b", "c", "e", "c"],
["b", "a", "b", "c"],
["c", "e", "d", "e"],
["d", "e", "c", "e"],
["a", "b", "c", "b"],
["a", "b", "c", "d"],
],
}
class TestWalk(unittest.TestCase):
    """Compare seeded random walks of each mode with the recorded WALKS."""

    # NOTE(review): the "PreCompFirstOrder" case is paired with pecanpy.PreComp
    # rather than pecanpy.PreCompFirstOrder -- confirm whether that is intended
    # before changing it, since the recorded walks may depend on it.
    @parameterized.expand(
        [
            ("FirstOrderUnweighted", pecanpy.FirstOrderUnweighted),
            ("PreCompFirstOrder", pecanpy.PreComp),
            ("PreComp", pecanpy.PreComp),
            ("SparseOTF", pecanpy.SparseOTF),
            ("DenseOTF", pecanpy.DenseOTF),
        ],
    )
    def test_first_order_unweighted(self, name, mode):
        """Simulate walks on MAT with a fixed seed and match the expected list."""
        walker = mode.from_mat(MAT, IDS, p=1, q=1, random_state=0)
        simulated = walker.simulate_walks(2, 3)
        self.assertEqual(simulated, WALKS[name])
        print(simulated)
# Allow running this module directly with the unittest runner.
if __name__ == "__main__":
unittest.main()
================================================
FILE: tox.ini
================================================
[tox]
minversion = 3.8.0
envlist =
python3.8
python3.9
python3.10
python3.11
flake8
mypy
isolated_build = true
[gh-actions]
python =
3.8: python3.8, flake8
3.9: python3.9
3.10: python3.10
3.11: python3.11
[testenv]
setenv =
PYTHONPATH = {toxinidir}
deps =
-r{toxinidir}/requirements.txt
.[dev]
commands =
pytest --basetemp={envtmpdir} test/
[testenv:mypy]
skip_install = true
deps =
mypy
numpy
commands = mypy src/pecanpy
[testenv:flake8]
skip_install = true
deps =
flake8
# flake8-bandit
flake8-builtins
flake8-bugbear
flake8-colors
flake8-commas
flake8-comprehensions
flake8-docstrings
flake8-import-order
flake8-use-fstring
pep8-naming
pydocstyle
commands =
flake8 src/pecanpy/
description = Run the flake8 tool with several plugins (docstrings, import order, pep8 naming).
[flake8]
max-line-length = 88
extend-ignore =
A005
E203
# current limitation of nptyping https://github.com/ramonhagenaars/nptyping/issues/63
F722
# init param docstring in class docstring
D107
exclude =
.tox,
.git,
__pycache__,
build,
dist,
*.pyc,
*.egg-info,
.cache,
.eggs
import-order-style = pycharm
# NOTE(review): pybel and bel_resources do not appear anywhere else in this
# repository; these names look carried over from another project's template.
# Confirm whether pecanpy should be listed here instead.
application-import-names =
pybel
bel_resources
tests
gitextract_5ev1jrt4/
├── .bumpversion.cfg
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── release.yml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── LICENSE
├── README.md
├── demo/
│   ├── karate.edg
│   ├── reproducibility.sh
│   └── run_pecanpy
├── docs/
│   ├── Makefile
│   ├── requirements.txt
│   └── source/
│       ├── conf.py
│       ├── index.rst
│       └── pecanpy.rst
├── pyproject.toml
├── requirements.txt
├── setup.cfg
├── setup.py
├── src/
│   └── pecanpy/
│       ├── __init__.py
│       ├── cli.py
│       ├── experimental.py
│       ├── graph.py
│       ├── pecanpy.py
│       ├── rw/
│       │   ├── __init__.py
│       │   ├── dense_rw.py
│       │   └── sparse_rw.py
│       ├── typing.py
│       └── wrappers.py
├── test/
│   ├── test_cli.py
│   ├── test_graph.py
│   ├── test_pecanpy.py
│   └── test_walk.py
└── tox.ini
SYMBOL INDEX (141 symbols across 11 files)
FILE: src/pecanpy/cli.py
function parse_args (line 27) | def parse_args():
function check_mode (line 179) | def check_mode(g, args):
function read_graph (line 258) | def read_graph(args):
function learn_embeddings (line 308) | def learn_embeddings(args, walks):
function preprocess (line 329) | def preprocess(g):
function simulate_walks (line 335) | def simulate_walks(args, g):
function main (line 340) | def main():
FILE: src/pecanpy/experimental.py
class Node2vecPlusPlus (line 8) | class Node2vecPlusPlus(Base, DenseRWGraph):
method __init__ (line 28) | def __init__(self, *args, **kwargs):
method get_move_forward (line 31) | def get_move_forward(self):
method get_normalized_probs (line 63) | def get_normalized_probs(
FILE: src/pecanpy/graph.py
class BaseGraph (line 19) | class BaseGraph:
method __init__ (line 28) | def __init__(self):
method nodes (line 33) | def nodes(self) -> List[str]:
method num_nodes (line 38) | def num_nodes(self) -> int:
method num_edges (line 43) | def num_edges(self) -> int:
method density (line 51) | def density(self) -> float:
method set_node_ids (line 55) | def set_node_ids(
method get_has_nbrs (line 99) | def get_has_nbrs(self):
method get_move_forward (line 103) | def get_move_forward(self):
class AdjlstGraph (line 108) | class AdjlstGraph(BaseGraph):
method __init__ (line 138) | def __init__(self):
method edges_iter (line 144) | def edges_iter(self) -> Iterator[Tuple[int, int, float]]:
method edges (line 151) | def edges(self) -> List[Tuple[int, int, float]]:
method num_edges (line 156) | def num_edges(self):
method _read_edge_line (line 161) | def _read_edge_line(
method _is_valid_edge_weight (line 182) | def _is_valid_edge_weight(id1: str, id2: str, weight: float) -> bool:
method _check_edge_existence (line 194) | def _check_edge_existence(
method get_node_idx (line 217) | def get_node_idx(self, node_id: str) -> int:
method add_node (line 222) | def add_node(self, node_id: str):
method _add_edge_from_idx (line 238) | def _add_edge_from_idx(self, idx1: int, idx2: int, weight: float):
method add_edge (line 243) | def add_edge(
method read (line 270) | def read(
method save (line 307) | def save(self, path: str, unweighted: bool = False, delimiter: str = "...
method to_csr (line 323) | def to_csr(self) -> CSR:
method to_dense (line 343) | def to_dense(self) -> AdjMat:
method from_mat (line 365) | def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):
class SparseGraph (line 389) | class SparseGraph(BaseGraph):
method __init__ (line 409) | def __init__(self):
method num_edges (line 416) | def num_edges(self) -> int:
method read_edg (line 423) | def read_edg(
method read_npz (line 447) | def read_npz(self, path: str, weighted: bool, implicit_ids: bool = Fal...
method save (line 488) | def save(self, path: str):
method from_adjlst_graph (line 499) | def from_adjlst_graph(cls, adjlst_graph, **kwargs):
method from_mat (line 513) | def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):
class DenseGraph (line 531) | class DenseGraph(BaseGraph):
method __init__ (line 558) | def __init__(self):
method num_edges (line 564) | def num_edges(self) -> int:
method data (line 572) | def data(self) -> Optional[AdjMat]:
method data (line 577) | def data(self, data: AdjMat):
method nonzero (line 583) | def nonzero(self) -> Optional[AdjNonZeroMat]:
method read_npz (line 587) | def read_npz(self, path: str, weighted: bool, implicit_ids: bool = Fal...
method read_edg (line 613) | def read_edg(
method save (line 627) | def save(self, path: str):
method from_adjlst_graph (line 632) | def from_adjlst_graph(cls, adjlst_graph, **kwargs):
method from_mat (line 646) | def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):
FILE: src/pecanpy/pecanpy.py
class Base (line 27) | class Base(BaseGraph):
method __init__ (line 83) | def __init__(
method _map_walk (line 103) | def _map_walk(self, walk_idx_ary: Uint32Array) -> List[str]:
method simulate_walks (line 116) | def simulate_walks(
method _random_walks (line 166) | def _random_walks(
method setup_get_normalized_probs (line 212) | def setup_get_normalized_probs(self):
method preprocess_transition_probs (line 231) | def preprocess_transition_probs(self):
method _preprocess_transition_probs (line 235) | def _preprocess_transition_probs(self):
method embed (line 240) | def embed(
class FirstOrderUnweighted (line 293) | class FirstOrderUnweighted(Base, SparseRWGraph):
method __init__ (line 296) | def __init__(self, *args, **kwargs):
method get_move_forward (line 299) | def get_move_forward(self):
class PreCompFirstOrder (line 312) | class PreCompFirstOrder(Base, SparseRWGraph):
method __init__ (line 315) | def __init__(self, *args, **kwargs):
method get_move_forward (line 319) | def get_move_forward(self):
method preprocess_transition_probs (line 336) | def preprocess_transition_probs(self):
class PreComp (line 364) | class PreComp(Base, SparseRWGraph):
method __init__ (line 377) | def __init__(self, *args, **kwargs):
method get_move_forward (line 384) | def get_move_forward(self):
method preprocess_transition_probs (line 442) | def preprocess_transition_probs(self):
class SparseOTF (line 510) | class SparseOTF(Base, SparseRWGraph):
method __init__ (line 519) | def __init__(self, *args, **kwargs):
method get_move_forward (line 522) | def get_move_forward(self):
class DenseOTF (line 564) | class DenseOTF(Base, DenseRWGraph):
method __init__ (line 573) | def __init__(self, *args, **kwargs):
method get_move_forward (line 576) | def get_move_forward(self):
function alias_setup (line 618) | def alias_setup(probs):
function alias_draw (line 669) | def alias_draw(j, q):
FILE: src/pecanpy/rw/dense_rw.py
class DenseRWGraph (line 8) | class DenseRWGraph(DenseGraph):
method get_noise_thresholds (line 11) | def get_noise_thresholds(self):
method get_has_nbrs (line 21) | def get_has_nbrs(self):
method get_normalized_probs (line 36) | def get_normalized_probs(
method get_extended_normalized_probs (line 76) | def get_extended_normalized_probs(
FILE: src/pecanpy/rw/sparse_rw.py
class SparseRWGraph (line 9) | class SparseRWGraph(SparseGraph):
method get_has_nbrs (line 12) | def get_has_nbrs(self):
method get_noise_thresholds (line 22) | def get_noise_thresholds(self):
method get_normalized_probs_first_order (line 39) | def get_normalized_probs_first_order(data, indices, indptr, cur_idx):
method get_normalized_probs (line 53) | def get_normalized_probs(
method get_extended_normalized_probs (line 95) | def get_extended_normalized_probs(
function get_nbrs (line 134) | def get_nbrs(indptr, indices, data, idx):
function isnotin (line 143) | def isnotin(ptr_ary1, ptr_ary2):
function isnotin_extended (line 234) | def isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, noise_thresholds):
FILE: src/pecanpy/wrappers.py
class Timer (line 5) | class Timer:
method __init__ (line 8) | def __init__(self, name, verbose=True):
method __call__ (line 12) | def __call__(self, func):
FILE: test/test_cli.py
class TestCli (line 32) | class TestCli(unittest.TestCase):
method setUpClass (line 34) | def setUpClass(cls):
method tearDownClass (line 39) | def tearDownClass(cls):
method setUp (line 46) | def setUp(self):
method tearDown (line 54) | def tearDown(self):
method execute (line 59) | def execute(self, mode, input_file, p=1, q=1):
method test_firstorderunweighted_catch (line 69) | def test_firstorderunweighted_catch(self):
method test_precompfirstorder_catch (line 75) | def test_precompfirstorder_catch(self):
method test_from_edg (line 82) | def test_from_edg(self, name):
method test_from_npz (line 86) | def test_from_npz(self, name):
FILE: test/test_graph.py
class TestBaseGraph (line 82) | class TestBaseGraph(unittest.TestCase):
method setUp (line 83) | def setUp(self):
method test_set_node_ids (line 87) | def test_set_node_ids(self):
method test_properties (line 91) | def test_properties(self):
class TestAdjlstGraph (line 99) | class TestAdjlstGraph(unittest.TestCase):
method setUp (line 100) | def setUp(self):
method tearDown (line 105) | def tearDown(self):
method test_from_mat (line 110) | def test_from_mat(self):
method test_properties (line 120) | def test_properties(self):
method test_edges (line 133) | def test_edges(self):
method test_save (line 158) | def test_save(self):
class TestSparseGraph (line 205) | class TestSparseGraph(unittest.TestCase):
method tearDown (line 206) | def tearDown(self):
method validate (line 211) | def validate(self):
method test_from_mat (line 236) | def test_from_mat(self):
method test_from_adjlst_graph (line 242) | def test_from_adjlst_graph(self):
class TestDenseGraph (line 249) | class TestDenseGraph(unittest.TestCase):
method tearDown (line 250) | def tearDown(self):
method validate (line 254) | def validate(self):
method test_from_mat (line 273) | def test_from_mat(self):
method test_from_adjlst_graph (line 279) | def test_from_adjlst_graph(self):
function test_csr_from_scipy (line 287) | def test_csr_from_scipy(tmpdir):
function test_implicit_ids (line 321) | def test_implicit_ids(implicit_ids, graph_factory):
function karate_graph_converted (line 336) | def karate_graph_converted(pytestconfig, tmpdir_factory):
FILE: test/test_pecanpy.py
class TestPecanPy (line 22) | class TestPecanPy(unittest.TestCase):
method setUpClass (line 24) | def setUpClass(self):
method test_from_mat (line 31) | def test_from_mat(self, name, mode):
method test_from_edg (line 37) | def test_from_edg(self, name, mode):
FILE: test/test_walk.py
class TestWalk (line 85) | class TestWalk(unittest.TestCase):
method test_first_order_unweighted (line 95) | def test_first_order_unweighted(self, name, mode):
Condensed preview — 36 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (126K chars).
[
{
"path": ".bumpversion.cfg",
"chars": 758,
"preview": "[bumpversion]\ncurrent_version = 2.0.10-dev\ntag = False\ncommit = True\nmessage = bump version: {current_version} -> {new_v"
},
{
"path": ".github/dependabot.yml",
"chars": 666,
"preview": "# To get started with Dependabot version updates, you'll need to specify which\n# package ecosystems to update and where "
},
{
"path": ".github/workflows/release.yml",
"chars": 529,
"preview": "name: Release Package\n\non:\n release:\n types: [created]\n\njobs:\n deploy:\n runs-on: ubuntu-latest\n\n steps:\n -"
},
{
"path": ".github/workflows/tests.yml",
"chars": 588,
"preview": "name: Tests\n\non:\n - push\n - pull_request\n\njobs:\n test:\n runs-on: ${{ matrix.os }}\n strategy:\n matrix:\n "
},
{
"path": ".gitignore",
"chars": 1238,
"preview": "# vim buffer\n*.swp\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Di"
},
{
"path": ".pre-commit-config.yaml",
"chars": 666,
"preview": "repos:\n - repo: https://github.com/pre-commit/pre-commit-hooks\n rev: v4.5.0\n hooks:\n - id: trailing-whitespa"
},
{
"path": ".readthedocs.yml",
"chars": 253,
"preview": "# Read the Docs configuration file\n\nversion: 2\n\nbuild:\n os: ubuntu-22.04\n tools:\n python: \"3.8\"\n\nsphinx:\n config"
},
{
"path": "LICENSE",
"chars": 1552,
"preview": "BSD 3-Clause License\n\nCopyright (c) 2020-2021, Krishnan Laboratory, Michigan State University.\nAll rights reserved.\n\nRed"
},
{
"path": "README.md",
"chars": 9770,
"preview": "[](https://doi.org/10.5281/zenodo.6386437)\n[![Documentati"
},
{
"path": "demo/karate.edg",
"chars": 407,
"preview": "1\t32\n1\t22\n1\t20\n1\t18\n1\t14\n1\t13\n1\t12\n1\t11\n1\t9\n1\t8\n1\t7\n1\t6\n1\t5\n1\t4\n1\t3\n1\t2\n2\t31\n2\t22\n2\t20\n2\t18\n2\t14\n2\t8\n2\t4\n2\t3\n3\t14\n3\t9\n3\t"
},
{
"path": "demo/reproducibility.sh",
"chars": 445,
"preview": "#!/bin/bash --login\n# reproducibility.sh\n# Test the reproducibility of PecanPy between runs.\n\nsource ~/.bashrc\n\nrs=100\ne"
},
{
"path": "demo/run_pecanpy",
"chars": 666,
"preview": "#!/bin/bash\ncd $(dirname $(realpath $0))\ncd ../\n\nset -v\n\n# run with PreComp mode (default)\npecanpy --input demo/karate.e"
},
{
"path": "docs/Makefile",
"chars": 638,
"preview": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the "
},
{
"path": "docs/requirements.txt",
"chars": 24,
"preview": "sphinx\nsphinx_rtd_theme\n"
},
{
"path": "docs/source/conf.py",
"chars": 2451,
"preview": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common op"
},
{
"path": "docs/source/index.rst",
"chars": 114,
"preview": "Welcome to PecanPy's documentation\n==================================\n\n.. toctree::\n :maxdepth: 2\n\n pecanpy\n"
},
{
"path": "docs/source/pecanpy.rst",
"chars": 430,
"preview": "PecanPy package\n===============\n\nCommand line interface\n----------------------\n\n.. automodule:: pecanpy.cli\n :members:"
},
{
"path": "pyproject.toml",
"chars": 209,
"preview": "[build-system]\nrequires = [\"setuptools>=42.0\", \"wheel\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[tool.mypy]\nignore_miss"
},
{
"path": "requirements.txt",
"chars": 186,
"preview": "gensim==4.3.2\nnptyping==2.5.0\nnumba-progress==1.1.0\nnumba==0.58.1\nnumpy==1.23.2\nscipy<1.13 # triu import issue (https:/"
},
{
"path": "setup.cfg",
"chars": 1470,
"preview": "[metadata]\nname = pecanpy\nversion = 2.0.10-dev\nauthor = Remy Liu\nauthor_email = liurenmi@msu.edu\ndescription = A paralle"
},
{
"path": "setup.py",
"chars": 89,
"preview": "\"\"\"Setup module.\"\"\"\nimport setuptools\n\nif __name__ == \"__main__\":\n setuptools.setup()\n"
},
{
"path": "src/pecanpy/__init__.py",
"chars": 163,
"preview": "\"\"\"PecanPy: parallelized, efficient, and accelerated node2vec.\"\"\"\nfrom . import graph\nfrom . import pecanpy\n\nversion = \""
},
{
"path": "src/pecanpy/cli.py",
"chars": 9444,
"preview": "\"\"\"Command line utility for PecanPy.\n\nThis is the command line interface for the ``pecanpy`` package.\n\nExamples:\n Run"
},
{
"path": "src/pecanpy/experimental.py",
"chars": 3846,
"preview": "\"\"\"Experimental features.\"\"\"\nimport numpy as np\nfrom numba import njit\nfrom pecanpy.pecanpy import Base\nfrom pecanpy.rw."
},
{
"path": "src/pecanpy/graph.py",
"chars": 22851,
"preview": "\"\"\"Lite graph objects used by pecanpy.\"\"\"\nimport warnings\n\nimport numpy as np\n\nfrom .typing import AdjMat\nfrom .typing i"
},
{
"path": "src/pecanpy/pecanpy.py",
"chars": 23654,
"preview": "\"\"\"Different strategies for generating node2vec walks.\"\"\"\nimport numpy as np\nfrom gensim.models import Word2Vec\nfrom num"
},
{
"path": "src/pecanpy/rw/__init__.py",
"chars": 185,
"preview": "\"\"\"Graph objects equipped with random walk transition functions.\"\"\"\nfrom .dense_rw import DenseRWGraph\nfrom .sparse_rw i"
},
{
"path": "src/pecanpy/rw/dense_rw.py",
"chars": 4089,
"preview": "\"\"\"Dense Graph object equipped with random walk computation.\"\"\"\nimport numpy as np\nfrom numba import njit\n\nfrom ..graph "
},
{
"path": "src/pecanpy/rw/sparse_rw.py",
"chars": 10109,
"preview": "\"\"\"Sparse Graph equipped with random walk computation.\"\"\"\nimport numpy as np\nfrom numba import boolean\nfrom numba import"
},
{
"path": "src/pecanpy/typing.py",
"chars": 1396,
"preview": "\"\"\"Type annotations.\"\"\"\nfrom typing import Any\nfrom typing import Callable\nfrom typing import Dict\nfrom typing import It"
},
{
"path": "src/pecanpy/wrappers.py",
"chars": 706,
"preview": "\"\"\"Wrappers used by pecanpy.\"\"\"\nimport time\n\n\nclass Timer:\n \"\"\"Timer for logging runtime of function.\"\"\"\n\n def __i"
},
{
"path": "test/test_cli.py",
"chars": 2555,
"preview": "import os\nimport os.path as osp\nimport shutil\nimport subprocess\nimport tempfile\nimport unittest\nfrom unittest.mock impor"
},
{
"path": "test/test_graph.py",
"chars": 11464,
"preview": "import os\nimport os.path as osp\nimport shutil\nimport tempfile\nimport unittest\nfrom itertools import chain\n\nimport numpy "
},
{
"path": "test/test_pecanpy.py",
"chars": 1237,
"preview": "import os.path as osp\nimport unittest\n\nfrom numba import set_num_threads\nfrom parameterized import parameterized\nfrom pe"
},
{
"path": "test/test_walk.py",
"chars": 2628,
"preview": "import unittest\n\nimport numpy as np\nfrom numba import set_num_threads\nfrom parameterized import parameterized\nfrom pecan"
},
{
"path": "tox.ini",
"chars": 1335,
"preview": "[tox]\nminversion = 3.8.0\nenvlist =\n python3.8\n python3.9\n python3.10\n python3.11\n flake8\n mypy\nisolate"
}
]
About this extraction
This page contains the full source code of the krishnanlab/PecanPy GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 36 files (116.0 KB), approximately 31.1k tokens, and a symbol index with 141 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.