[
  {
    "path": ".bumpversion.cfg",
    "content": "[bumpversion]\ncurrent_version = 2.0.10-dev\ntag = False\ncommit = True\nmessage = bump version: {current_version} -> {new_version}\nparse = (?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)([-](?P<release>(dev|stable)+)(?P<build>\\d*))?\nserialize = \n\t{major}.{minor}.{patch}-{release}{build}\n\t{major}.{minor}.{patch}-{release}\n\t{major}.{minor}.{patch}\n\n[bumpversion:part:release]\noptional_value = stable\nvalues = \n\tdev\n\tstable\n\n[bumpversion:file:setup.cfg]\nsearch = version = {current_version}\nreplace = version = {new_version}\n\n[bumpversion:file:src/pecanpy/__init__.py]\nsearch = __version__ = \"{current_version}\"\nreplace = version = \"{new_version}\"\n\n[bumpversion:file:docs/source/conf.py]\nsearch = release = \"{current_version}\"\nreplace = release = \"{new_version}\"\n"
  },
  {
    "path": ".github/dependabot.yml",
    "content": "# To get started with Dependabot version updates, you'll need to specify which\n# package ecosystems to update and where the package manifests are located.\n# Please see the documentation for all configuration options:\n# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates\n\nversion: 2\nupdates:\n  - package-ecosystem: \"pip\" # See documentation for possible values\n    directory: \"/\" # Location of package manifests\n    schedule:\n      interval: \"daily\"\n    ignore:\n      - dependency-name: \"numpy\"\n        versions: [\"1.22.x\"]  # Numba 0.55.1 do not support numpy 1.22.x yet https://github.com/numba/numba/issues/7754\n"
  },
  {
    "path": ".github/workflows/release.yml",
    "content": "name: Release Package\n\non:\n  release:\n    types: [created]\n\njobs:\n  deploy:\n    runs-on: ubuntu-latest\n\n    steps:\n    - uses: actions/checkout@v4\n    - uses: actions/setup-python@v5\n\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        pip install setuptools wheel twine\n\n    - name: Build and publish\n      env:\n        TWINE_USERNAME: __token__\n        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}\n      run: |\n        python setup.py sdist bdist_wheel\n        twine upload dist/*\n"
  },
  {
    "path": ".github/workflows/tests.yml",
    "content": "name: Tests\n\non:\n  - push\n  - pull_request\n\njobs:\n  test:\n    runs-on: ${{ matrix.os }}\n    strategy:\n      matrix:\n        os: [ubuntu-latest, windows-latest]\n        python-version: ['3.8', '3.9', '3.10', '3.11']\n\n    steps:\n    - uses: actions/checkout@v4\n\n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v5\n      with:\n        python-version: ${{ matrix.python-version }}\n\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        pip install tox tox-gh-actions\n\n    - name: Test with tox\n      run: tox\n"
  },
  {
    "path": ".gitignore",
    "content": "# vim buffer\n*.swp\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n\n# IDEA\n.idea/\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v4.5.0\n    hooks:\n      - id: trailing-whitespace\n        exclude: .bumpversion.cfg\n      - id: end-of-file-fixer\n\n  - repo: https://github.com/asottile/reorder-python-imports\n    rev: v3.12.0\n    hooks:\n      - id: reorder-python-imports\n        args: [\"--py38-plus\"]\n\n  - repo: https://github.com/asottile/add-trailing-comma\n    rev: v3.1.0\n    hooks:\n      - id: add-trailing-comma\n\n  - repo: https://github.com/asottile/pyupgrade\n    rev: v3.15.0\n    hooks:\n      - id: pyupgrade\n\n  - repo: https://github.com/psf/black\n    rev: 23.12.1\n    hooks:\n      - id: black\n        args: [--safe]\n"
  },
  {
    "path": ".readthedocs.yml",
    "content": "# Read the Docs configuration file\n\nversion: 2\n\nbuild:\n  os: ubuntu-22.04\n  tools:\n    python: \"3.8\"\n\nsphinx:\n    configuration: docs/source/conf.py\n\npython:\n   install:\n      - requirements: docs/requirements.txt\n      - requirements: requirements.txt\n"
  },
  {
    "path": "LICENSE",
    "content": "BSD 3-Clause License\n\nCopyright (c) 2020-2021, Krishnan Laboratory, Michigan State University.\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above copyright notice, this\n  list of conditions and the following disclaimer.\n\n* Redistributions in binary form must reproduce the above copyright notice,\n  this list of conditions and the following disclaimer in the documentation\n  and/or other materials provided with the distribution.\n\n* Neither the name of the copyright holder nor the names of its\n  contributors may be used to endorse or promote products derived from\n  this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "README.md",
    "content": "[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6386437.svg)](https://doi.org/10.5281/zenodo.6386437)\n[![Documentation Status](https://readthedocs.org/projects/pecanpy/badge/?version=latest)](https://pecanpy.readthedocs.io/en/latest/?badge=latest)\n[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)\n[![Tests](https://github.com/krishnanlab/PecanPy/actions/workflows/tests.yml/badge.svg)](https://github.com/krishnanlab/PecanPy/actions/workflows/tests.yml)\n\n# PecanPy: A parallelized, efficient, and accelerated _node2vec(+)_ in Python\n\nLearning low-dimensional representations (embeddings) of nodes in large graphs is key to applying machine learning on massive biological networks. _Node2vec_ is the most widely used method for node embedding. PecanPy is a fast, parallelized, memory efficient, and cache optimized Python implementation of [_node2vec_](https://github.com/aditya-grover/node2vec). It uses cache-optimized compact graph data structures and precomputing/parallelization to result in fast, high-quality node embeddings for biological networks of all sizes and densities. Detailed source code documentation can be found [here](https://pecanpy.readthedocs.io/).\n\nThe details of implementation and the optimizations, along with benchmarks, are described in the application note [_PecanPy: a fast, efficient and parallelized Python implementation of node2vec_](https://doi.org/10.1093/bioinformatics/btab202), which is published in _Bioinformatics_. The benchmarking results presented in the preprint can be reproduced using the test scripts provided in the companion [benchmarks repo](https://github.com/krishnanlab/PecanPy_benchmarks).\n\n**v2 update**: PecanPy is now equipped with _node2vec+_, which is a natural extension of _node2vec_ and handles weighted graph more effectively. 
For more information, see [*Accurately Modeling Biased Random Walks on Weighted Graphs Using Node2vec+*](https://arxiv.org/abs/2109.08031). The datasets and test scripts for reproducing the presented results are available in the [node2vec+ benchmarks repo](https://github.com/krishnanlab/node2vecplus_benchmarks).\n\n## Installation\n\nInstall from the latest release with:\n\n```bash\n$ pip install pecanpy\n```\n\nInstall the latest version (unreleased) in development mode with:\n\n```bash\n$ git clone https://github.com/krishnanlab/pecanpy.git\n$ cd pecanpy\n$ pip install -e .\n```\n\nwhere `-e` means \"editable\" mode so you don't have to reinstall every time you make changes.\n\nPecanPy installs a command line utility `pecanpy` that can be used directly.\n\n## Usage\n\nPecanPy operates in three different modes – `PreComp`, `SparseOTF`, and `DenseOTF` – that are optimized for networks of different sizes and densities; `PreComp` for networks that are small (≤10k nodes; any density), `SparseOTF` for networks that are large and sparse (>10k nodes; ≤10% of edges), and `DenseOTF` for networks that are large and dense (>10k nodes; >10% of edges). These modes appropriately take advantage of compact/dense graph data structures, precomputing transition probabilities, and computing 2nd-order transition probabilities during walk generation to achieve significant improvements in performance.\n\n### Example\n\nTo run *node2vec* on Zachary's karate club network using `SparseOTF` mode, execute the following command from the project home directory:\n\n```bash\npecanpy --input demo/karate.edg --output demo/karate.emb --mode SparseOTF\n```\n\n### Node2vec+\n\nTo enable _node2vec+_, specify the `--extend` option.\n\n```bash\npecanpy --input demo/karate.edg --output demo/karate_n2vplus.emb --mode SparseOTF --extend\n```\n\n**Note**: _node2vec+_ is only beneficial for embedding _weighted_ graphs. For unweighted graphs, _node2vec+_ is equivalent to _node2vec_. 
The above example only serves as a demonstration of enabling _node2vec+_.\n\n### Demo\n\nExecute the following command for a full demonstration:\n\n```bash\nsh demo/run_pecanpy\n```\n\n### Mode\n\nAs mentioned above, PecanPy contains three main modes for generating node2vec random walks,\neach of which is better optimized for different network sizes/densities:\n| Mode | Network size/density | Optimization |\n|:-----|:---------------------|:-------------|\n| `PreComp` | <10k nodes, <0.1% edges | Precompute second order transition probabilities, using CSR graph |\n| `SparseOTF` (default) | (≥10k nodes, ≥0.1% and <20% of edges) or (<10k nodes, ≥0.1% edges) | Transition probabilities computed on-the-fly, using CSR graph |\n| `DenseOTF` | >20% of edges | Transition probabilities computed on-the-fly, using dense matrix |\n\n#### Compatibility and recommendations\n\n| Mode | Weighted | ``p,q!=1`` | Node2vec+ | Speed | Use this if |\n|:-----|----------------|---------------|-----------|:------------|:--------|\n|``PreComp``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash::dash:|The graph is small and sparse|\n|``SparseOTF``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash:|The graph is sparse but not necessarily small|\n|``DenseOTF``|:white_check_mark:|:white_check_mark:|:white_check_mark:|:dash:|The graph is extremely dense|\n|``PreCompFirstOrder``|:white_check_mark:|:x:|:x:|:dash::dash:|Run with ``p = q = 1`` on weighted graph|\n|``FirstOrderUnweighted``|:x:|:x:|:x:|:dash::dash::dash:|Run with ``p = q = 1`` on unweighted graph|\n\n### Options\n\nCheck out the full list of options available using:\n```bash\npecanpy --help\n```\n\n### Input\n\nThe supported input is a network file as an edgelist `.edg` file (node id could be int or string):\n\n```\nnode1_id node2_id <weight_float, optional>\n```\n\nAnother supported input format (only for `DenseOTF`) is the numpy array `.npz` file. 
Run the following command to prepare a `.npz` file from a `.edg` file.\n\n```bash\npecanpy --input $input_edgelist --output $output_npz --task todense\n```\n\nThe default delimiter for `.edg` is a tab (`\\t`); you may change this by passing in the `--delimiter` option.\n\n### Output\n\nThe output file has *n+1* lines for a graph with *n* vertices, with a header line of the following format:\n\n```\nnum_of_nodes dim_of_representation\n```\n\nThe next *n* lines are the representations of dimension *d* following the corresponding node ID:\n\n```\nnode_id dim_1 dim_2 ... dim_d\n```\n\n### Development Note\n\nRun `black src/pecanpy/` to automatically follow black code formatting.\nRun `tox -e flake8` and resolve suggestions before committing to ensure consistent code style.\n\n## Additional Information\n### Documentation\nDetailed documentation for PecanPy is available [here](https://pecanpy.readthedocs.io/).\n\n### Support\nFor support, please consider opening a GitHub issue and we will do our best to reply in a timely manner.\nAlternatively, if you would like to keep the conversation private, feel free to contact [Remy Liu](https://twitter.com/RemyLau3) at liurenmi@msu.edu.\n\n### License\nThis repository and all its contents are released under the [BSD 3-Clause License](https://opensource.org/licenses/BSD-3-Clause); see [LICENSE](https://github.com/krishnanlab/pecanpy/blob/master/LICENSE).\n\n### Citation\nIf you use PecanPy, please cite:\nLiu R, Krishnan A (2021) **PecanPy: a fast, efficient, and parallelized Python implementation of _node2vec_.** _Bioinformatics_ https://doi.org/10.1093/bioinformatics/btab202\n\nIf you find _node2vec+_ useful, please cite:\nLiu R, Hirn M, Krishnan A (2023) **Accurately modeling biased random walks on weighted graphs using _node2vec+_.** _Bioinformatics_ https://doi.org/10.1093/bioinformatics/btad047\n\n### Authors\nRenming Liu, Arjun Krishnan*\n>\\*General correspondence should be addressed to AK at 
arjun.krishnan@cuanschutz.edu.\n\n### Funding\nThis work was primarily supported by US National Institutes of Health (NIH) grants R35 GM128765 to AK and in part by MSU start-up funds to AK.\n\n### Acknowledgements\nWe thank [Christopher A. Mancuso](https://github.com/ChristopherMancuso), [Anna Yannakopoulos](http://yannakopoulos.com/), and the rest of the [Krishnan Lab](https://www.thekrishnanlab.org/team) for valuable discussions and feedback on the software and manuscript. Thanks to [Charles T. Hoyt](https://github.com/cthoyt) for making the software `pip` installable and for an extensive code review.\n\n### References\n\n**Original _node2vec_**\n* Grover, A. and Leskovec, J. (2016) node2vec: Scalable Feature Learning for Networks. ArXiv160700653 Cs Stat.\nOriginal _node2vec_ software and networks\n  * https://snap.stanford.edu/node2vec/ contains the original software and the networks (PPI, BlogCatalog, and Wikipedia) used in the original study (Grover and Leskovec, 2016).\n\n**Other networks**\n* Stark, C. et al. (2006) BioGRID: a general repository for interaction datasets. Nucleic Acids Res., 34, D535–D539.\n  * BioGRID human protein-protein interactions.\n\n* Szklarczyk, D. et al. (2015) STRING v10: protein–protein interaction networks, integrated over the tree of life. Nucleic Acids Res., 43, D447–D452.\n  * STRING predicted human gene interactions.\n\n* Greene, C.S. et al. (2015) Understanding multicellular function and disease with human tissue-specific networks. Nat. Genet., 47, 569–576.\n  * GIANT-TN is a generic genome-scale human gene network. GIANT-TN-c01 is a sub-network of GIANT-TN where edges with edge weight below 0.01 are discarded.\n\nBioGRID (Stark et al., 2006), STRING (Szklarczyk et al., 2015), and GIANT-TN (Greene et al., 2015) are available from https://doi.org/10.5281/zenodo.3352323.\n\n* Law, J.N. et al. (2019) Accurate and Efficient Gene Function Prediction using a Multi-Bacterial Network. 
bioRxiv, 646687.\n  * SSN200 is a cross-species network of proteins from 200 species with the edges representing protein sequence similarities. Downloaded from https://bioinformatics.cs.vt.edu/~jeffl/supplements/2019-fastsinksource/.\n"
  },
  {
    "path": "demo/karate.edg",
    "content": "1\t32\n1\t22\n1\t20\n1\t18\n1\t14\n1\t13\n1\t12\n1\t11\n1\t9\n1\t8\n1\t7\n1\t6\n1\t5\n1\t4\n1\t3\n1\t2\n2\t31\n2\t22\n2\t20\n2\t18\n2\t14\n2\t8\n2\t4\n2\t3\n3\t14\n3\t9\n3\t10\n3\t33\n3\t29\n3\t28\n3\t8\n3\t4\n4\t14\n4\t13\n4\t8\n5\t11\n5\t7\n6\t17\n6\t11\n6\t7\n7\t17\n9\t34\n9\t33\n9\t33\n10\t34\n14\t34\n15\t34\n15\t33\n16\t34\n16\t33\n19\t34\n19\t33\n20\t34\n21\t34\n21\t33\n23\t34\n23\t33\n24\t30\n24\t34\n24\t33\n24\t28\n24\t26\n25\t32\n25\t28\n25\t26\n26\t32\n27\t34\n27\t30\n28\t34\n29\t34\n29\t32\n30\t34\n30\t33\n31\t34\n31\t33\n32\t34\n32\t33\n33\t34\n"
  },
  {
    "path": "demo/reproducibility.sh",
    "content": "#!/bin/bash --login\n# reproducibility.sh\n# Test the reproducibility of PecanPy between runs.\n\nsource ~/.bashrc\n\nrs=100\nexport PYTHONHASHSEED=$rs\n\nconda activate pecanpy-dev\npecanpy --input karate.edg --output karate1.emd --mode FirstOrderUnweighted --workers 1 --random_state $rs\npecanpy --input karate.edg --output karate2.emd --mode FirstOrderUnweighted --workers 1 --random_state $rs\ncmp karate1.emd karate2.emd\nrm -f karate1.emd karate2.emd\n"
  },
  {
    "path": "demo/run_pecanpy",
    "content": "#!/bin/bash\ncd $(dirname $(realpath $0))\ncd ../\n\nset -v\n\n# run with PreComp mode (default)\npecanpy --input demo/karate.edg --output demo/karate.emb --verbose\n\n# run with SparseOTF mode\npecanpy --input demo/karate.edg --output demo/karate.emb --verbose --mode SparseOTF\n\n# run with DenseOTF mode\npecanpy --input demo/karate.edg --output demo/karate.emb --verbose --mode DenseOTF\n\n# convert and save edgelist as dense matrix\npecanpy --input demo/karate.edg --output demo/karate.npz --task todense\n\n# run with DenseOTF mode using dense array as input\npecanpy --input demo/karate.npz --output demo/karate.emb --verbose --mode DenseOTF\n\n# input parameters\npecanpy --help\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the environment for the first two.\nSPHINXOPTS    ?=\nSPHINXBUILD   ?= sphinx-build\nSOURCEDIR     = source\nBUILDDIR      = build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n"
  },
  {
    "path": "docs/requirements.txt",
    "content": "sphinx\nsphinx_rtd_theme\n"
  },
  {
    "path": "docs/source/conf.py",
    "content": "# Configuration file for the Sphinx documentation builder.\n#\n# This file only contains a selection of the most common options. For a full\n# list see the documentation:\n# https://www.sphinx-doc.org/en/master/usage/configuration.html\n# -- Path setup --------------------------------------------------------------\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#\nimport os\nimport sys\n\nsys.path.insert(0, os.path.abspath(\"../../src\"))\n\n\n# -- Project information -----------------------------------------------------\n\nproject = \"PecanPy\"\ncopyright = \"2020, Renming Liu and Arjun Krishnan\"\nauthor = \"Renming Liu and Arjun Krishnan\"\n\n# The full version, including alpha/beta/rc tags\nrelease = \"2.0.10-dev\"\n\n\n# -- General configuration ---------------------------------------------------\n\n# Add any Sphinx extension module names here, as strings. They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    \"sphinx.ext.autodoc\",\n    \"sphinx.ext.intersphinx\",\n    \"sphinx.ext.todo\",\n    \"sphinx.ext.coverage\",\n    \"sphinx.ext.viewcode\",\n    \"sphinx.ext.napoleon\",\n]\n\n# Napoleon settings\nnapoleon_google_docstring = True\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = [\"_templates\"]\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\n# This pattern also affects html_static_path and html_extra_path.\nexclude_patterns = []\n\n\n# -- Options for HTML output -------------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  
See the documentation for\n# a list of builtin themes.\n#\nhtml_theme = \"sphinx_rtd_theme\"\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\n# html_static_path = ['_static']\n\n# Example configuration for intersphinx: refer to the Python standard library.\nintersphinx_mapping = {\n    \"python\": (\"https://docs.python.org/3\", None),\n    \"networkx\": (\"https://networkx.github.io/documentation/latest/\", None),\n}\n\nautodoc_member_order = \"bysource\"\nautoclass_content = \"both\"\n"
  },
  {
    "path": "docs/source/index.rst",
    "content": "Welcome to PecanPy's documentation\n==================================\n\n.. toctree::\n    :maxdepth: 2\n\n    pecanpy\n"
  },
  {
    "path": "docs/source/pecanpy.rst",
    "content": "PecanPy package\n===============\n\nCommand line interface\n----------------------\n\n.. automodule:: pecanpy.cli\n   :members:\n   :undoc-members:\n   :show-inheritance:\n\nGraph Data Structures\n---------------------\n\n.. automodule:: pecanpy.graph\n   :members:\n   :undoc-members:\n   :show-inheritance:\n\nNode2vec implementations\n------------------------\n\n.. automodule:: pecanpy.pecanpy\n   :members:\n   :undoc-members:\n   :show-inheritance:\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\"setuptools>=42.0\", \"wheel\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[tool.mypy]\nignore_missing_imports = true\nfollow_imports = \"skip\"\nplugins = [\n    \"numpy.typing.mypy_plugin\",\n]\n"
  },
  {
    "path": "requirements.txt",
    "content": "gensim==4.3.2\nnptyping==2.5.0\nnumba-progress==1.1.0\nnumba==0.58.1\nnumpy==1.23.2\nscipy<1.13  # triu import issue (https://stackoverflow.com/a/78279318/12519564)\ntyping_extensions==4.13.2\n"
  },
  {
    "path": "setup.cfg",
    "content": "[metadata]\nname = pecanpy\nversion = 2.0.10-dev\nauthor = Remy Liu\nauthor_email = liurenmi@msu.edu\ndescription = A parallelized, efficient, and accelerated node2vec\nlong_description = file: README.md\nlong_description_content_type = text/markdown\n\n# Links\nurl = https://github.com/krishnanlab/PecanPy\nproject_urls =\n    Documentation = https://pecanpy.readthedocs.io/\n\n# License\nlicense_files = file: LICENSE\nlicense = BSD 3-Clause License\n\n# Search tags\nclassifiers =\n    Development Status :: 5 - Production/Stable\n    Programming Language :: Python\n    Programming Language :: Python :: 3 :: Only\n    Programming Language :: Python :: 3.8\n    Programming Language :: Python :: 3.9\n    Programming Language :: Python :: 3.10\n    Programming Language :: Python :: 3.11\nkeywords =\n    Network Embedding\n\n[options]\ninstall_requires =\n    gensim>=4.1.0\n    numpy>=1.20.0\n    numba>=0.46.0\n    numba-progress>=0.0.2\n    nptyping>=2.0.0\n    typing_extensions>=4.0.1\n\nzip_safe = false\ninclude_package_data = True\npython_requires = >=3.8\n\n# Where is my code\npackages = find:\npackage_dir =\n    = src\n\n[options.extras_require]\ndev =\n    bump2version==1.0.1\n    mypy==1.9.0\n    parameterized==0.9.0\n    pre-commit==3.5.0; python_version < \"3.9\"\n    pre-commit==4.2.0; python_version >= \"3.9\"\n    pytest-cov==5.0.0\n    pytest-xdist==3.6.1\n    pytest==8.3.5\n    tox==4.25.0\n\n[options.packages.find]\nwhere = src\n\n[options.entry_points]\nconsole_scripts =\n    pecanpy = pecanpy.cli:main\n"
  },
  {
    "path": "setup.py",
    "content": "\"\"\"Setup module.\"\"\"\nimport setuptools\n\nif __name__ == \"__main__\":\n    setuptools.setup()\n"
  },
  {
    "path": "src/pecanpy/__init__.py",
    "content": "\"\"\"PecanPy: parallelized, efficient, and accelerated node2vec.\"\"\"\nfrom . import graph\nfrom . import pecanpy\n\nversion = \"2.0.10-dev\"\n__all__ = [\"graph\", \"pecanpy\"]\n"
  },
  {
    "path": "src/pecanpy/cli.py",
    "content": "\"\"\"Command line utility for PecanPy.\n\nThis is the command line interface for the ``pecanpy`` package.\n\nExamples:\n    Run PecanPy in command line using ``PreComp`` mode to embed the karate network::\n\n        $ pecanpy --input demo/karate.edg --ouptut demo/karate.emb --mode PreComp\n\n    Checkout the full list of parameters by::\n\n        $ pecanpy --help\n\n\"\"\"\nimport argparse\nimport warnings\n\nimport numba\nimport numpy as np\nfrom gensim.models import Word2Vec\n\nfrom . import graph\nfrom . import pecanpy\nfrom .wrappers import Timer\n\n\ndef parse_args():\n    \"\"\"Parse node2vec arguments.\"\"\"\n    parser = argparse.ArgumentParser(\n        description=\"Run pecanpy, a parallelized, efficient, and accelerated \"\n        \"Python implementation of node2vec\",\n        formatter_class=argparse.ArgumentDefaultsHelpFormatter,\n    )\n\n    parser.add_argument(\n        \"--input\",\n        required=True,\n        help=\"Input graph (.edg or .npz) file path.\",\n    )\n\n    parser.add_argument(\n        \"--output\",\n        required=True,\n        help=\"Output embeddings file path. 
Save as .npz file if the specified \"\n        \"file path ends with .npz, otherwise save as a text file using the \"\n        \"gensim save_word2vec_format method.\",\n    )\n\n    parser.add_argument(\n        \"--task\",\n        default=\"pecanpy\",\n        choices=[\"pecanpy\", \"tocsr\", \"todense\"],\n        help=\"Task to be performed.\",\n    )\n\n    parser.add_argument(\n        \"--mode\",\n        default=\"SparseOTF\",\n        choices=[\n            \"DenseOTF\",\n            \"FirstOrderUnweighted\",\n            \"PreComp\",\n            \"PreCompFirstOrder\",\n            \"SparseOTF\",\n        ],\n        help=\"PecanPy execution mode.\",\n    )\n\n    parser.add_argument(\n        \"--dimensions\",\n        type=int,\n        default=128,\n        help=\"Number of dimensions.\",\n    )\n\n    parser.add_argument(\n        \"--walk-length\",\n        type=int,\n        default=80,\n        help=\"Length of walk per source.\",\n    )\n\n    parser.add_argument(\n        \"--num-walks\",\n        type=int,\n        default=10,\n        help=\"Number of walks per source.\",\n    )\n\n    parser.add_argument(\n        \"--window-size\",\n        type=int,\n        default=10,\n        help=\"Context size for optimization.\",\n    )\n\n    parser.add_argument(\n        \"--epochs\",\n        type=int,\n        default=1,\n        help=\"Number of epochs in SGD when training Word2Vec\",\n    )\n\n    parser.add_argument(\n        \"--workers\",\n        type=int,\n        default=0,\n        help=\"Number of parallel workers (0 to use all available threads).\",\n    )\n\n    parser.add_argument(\n        \"--p\",\n        type=float,\n        default=1,\n        help=\"Return hyperparameter.\",\n    )\n\n    parser.add_argument(\n        \"--q\",\n        type=float,\n        default=1,\n        help=\"Inout hyperparameter.\",\n    )\n\n    parser.add_argument(\n        \"--weighted\",\n        action=\"store_true\",\n        help=\"Boolean 
specifying (un)weighted.\",\n    )\n\n    parser.add_argument(\n        \"--directed\",\n        action=\"store_true\",\n        help=\"Graph is (un)directed.\",\n    )\n\n    parser.add_argument(\n        \"--verbose\",\n        action=\"store_true\",\n        help=\"Print out training details\",\n    )\n\n    parser.add_argument(\n        \"--extend\",\n        action=\"store_true\",\n        help=\"Use node2vec+ extension\",\n    )\n\n    parser.add_argument(\n        \"--gamma\",\n        type=float,\n        default=0,\n        help=\"Noisy edge threshold parameter.\",\n    )\n\n    parser.add_argument(\n        \"--random_state\",\n        type=int,\n        default=None,\n        help=\"Random seed for generating random walks.\",\n    )\n\n    parser.add_argument(\n        \"--delimiter\",\n        type=str,\n        default=\"\\t\",\n        help=\"Delimiter used between node IDs.\",\n    )\n\n    parser.add_argument(\n        \"--implicit_ids\",\n        action=\"store_true\",\n        help=\"If set, use canonical node ordering for the node IDs.\",\n    )\n\n    return parser.parse_args()\n\n\ndef check_mode(g, args):\n    \"\"\"Check mode selection.\n\n    Give recommendation to user for pecanpy mode based on graph size and density.\n\n    \"\"\"\n    mode = args.mode\n    weighted = args.weighted\n    p = args.p\n    q = args.q\n\n    # Check unweighted first order random walk usage\n    if mode == \"FirstOrderUnweighted\":\n        if not p == q == 1 or weighted:\n            raise ValueError(\n                f\"FirstOrderUnweighted only works when weighted = False and \"\n                f\"p = q = 1, got {weighted=}, {p=}, {q=}\",\n            )\n        return\n\n    if mode != \"FirstOrderUnweighted\" and p == q == 1 and not weighted:\n        warnings.warn(\n            \"When p = 1 and q = 1 with unweighted graph, it is highly \"\n            f\"recommended to use FirstOrderUnweighted over {mode} (current \"\n            \"selection). 
The runtime could be improved greatly with improved  \"\n            \"memory usage.\",\n            stacklevel=2,\n        )\n        return\n\n    # Check first order random walk usage\n    if mode == \"PreCompFirstOrder\":\n        if not p == q == 1:\n            raise ValueError(\n                f\"PreCompFirstOrder only works when p = q = 1, got {p=}, {q=}\",\n            )\n        return\n\n    if mode != \"PreCompFirstOrder\" and p == 1 == q:\n        warnings.warn(\n            \"When p = 1 and q = 1, it is highly recommended to use \"\n            f\"PreCompFirstOrder over {mode} (current selection). The runtime \"\n            \"could be improved greatly with low memory usage.\",\n            stacklevel=2,\n        )\n        return\n\n    # Check network density and recommend appropriate mode\n    g_size = g.num_nodes\n    g_dens = g.density\n    if (g_dens >= 0.2) & (mode != \"DenseOTF\"):\n        warnings.warn(\n            f\"Network density = {g_dens:.3f} (> 0.2), it is recommended to use \"\n            f\"DenseOTF over {mode} (current selection)\",\n            stacklevel=2,\n        )\n    if (g_dens < 0.001) & (g_size < 10000) & (mode != \"PreComp\"):\n        warnings.warn(\n            f\"Network density = {g_dens:.2e} (< 0.001) with {g_size} nodes \"\n            f\"(< 10000), it is recommended to use PreComp over {mode} (current \"\n            \"selection)\",\n            stacklevel=2,\n        )\n    if (g_dens >= 0.001) & (g_dens < 0.2) & (mode != \"SparseOTF\"):\n        warnings.warn(\n            f\"Network density = {g_dens:.3f}, it is recommended to use \"\n            f\"SparseOTF over {mode} (current selection)\",\n            stacklevel=2,\n        )\n    if (g_dens < 0.001) & (g_size >= 10000) & (mode != \"SparseOTF\"):\n        warnings.warn(\n            f\"Network density = {g_dens:.3f} (< 0.001) with {g_size} nodes \"\n            f\"(>= 10000), it is recommended to use SparseOTF over {mode} \"\n            \"(current 
selection)\",\n            stacklevel=2,\n        )\n\n\n@Timer(\"load Graph\")\ndef read_graph(args):\n    \"\"\"Read input network to memory.\n\n    Depending on the mode selected, reads the network either in CSR\n    representation (``PreComp`` and ``SparseOTF``) or 2d numpy array\n    (``DenseOTF``).\n\n    \"\"\"\n    path = args.input\n    output = args.output\n    p = args.p\n    q = args.q\n    workers = args.workers\n    verbose = args.verbose\n    weighted = args.weighted\n    directed = args.directed\n    extend = args.extend\n    gamma = args.gamma\n    random_state = args.random_state\n    mode = args.mode\n    task = args.task\n    delimiter = args.delimiter\n    implicit_ids = args.implicit_ids\n\n    if directed and extend:\n        raise NotImplementedError(\"Node2vec+ not implemented for directed graph yet.\")\n\n    if extend and not weighted:\n        print(\"NOTE: node2vec+ is equivalent to node2vec for unweighted graphs.\")\n\n    if task in [\"tocsr\", \"todense\"]:  # perform conversion then save and exit\n        g = graph.SparseGraph() if task == \"tocsr\" else graph.DenseGraph()\n        g.read_edg(path, weighted, directed, delimiter)\n        g.save(output)\n        exit()\n\n    pecanpy_mode = getattr(pecanpy, mode, None)\n    g = pecanpy_mode(p, q, workers, verbose, extend, gamma, random_state)\n\n    if path.endswith(\".npz\"):\n        g.read_npz(path, weighted, implicit_ids=implicit_ids)\n    else:\n        g.read_edg(path, weighted, directed, delimiter)\n\n    check_mode(g, args)\n\n    return g\n\n\n@Timer(\"train embeddings\")\ndef learn_embeddings(args, walks):\n    \"\"\"Learn embeddings by optimizing the Skipgram objective using SGD.\"\"\"\n    model = Word2Vec(\n        walks,\n        vector_size=args.dimensions,\n        window=args.window_size,\n        min_count=0,\n        sg=1,\n        workers=args.workers,\n        epochs=args.epochs,\n        seed=args.random_state,\n    )\n\n    output_path = args.output\n    if 
output_path.endswith(\".npz\"):\n        np.savez(output_path, IDs=model.wv.index_to_key, data=model.wv.vectors)\n    else:\n        model.wv.save_word2vec_format(output_path)\n\n\n@Timer(\"pre-compute transition probabilities\")\ndef preprocess(g):\n    \"\"\"Preprocessing transition probabilities with timer.\"\"\"\n    g.preprocess_transition_probs()\n\n\n@Timer(\"generate walks\")\ndef simulate_walks(args, g):\n    \"\"\"Simulate random walks with timer.\"\"\"\n    return g.simulate_walks(args.num_walks, args.walk_length)\n\n\ndef main():\n    \"\"\"Pipeline for representational learning for all nodes in a graph.\"\"\"\n    args = parse_args()\n\n    if args.workers == 0:\n        args.workers = numba.config.NUMBA_DEFAULT_NUM_THREADS\n    numba.set_num_threads(args.workers)\n\n    g = read_graph(args)\n    preprocess(g)\n    walks = simulate_walks(args, g)\n    learn_embeddings(args, walks)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "src/pecanpy/experimental.py",
    "content": "\"\"\"Experimental features.\"\"\"\nimport numpy as np\nfrom numba import njit\nfrom pecanpy.pecanpy import Base\nfrom pecanpy.rw.dense_rw import DenseRWGraph\n\n\nclass Node2vecPlusPlus(Base, DenseRWGraph):\n    \"\"\"Continuous extension of node2vec+ with DenseOTF framework.\n\n    In node2vec+ (see `DenseRWGraph.get_extended_normalized_probs`), there is\n    discontinuous region of the bias-factor (alpha). More specifically, the\n    transition between the noisy-edge region (w1 < 1 and w2 < 1, where w1 is\n    the normalized edge weight connecting from current to the previous node,\n    and w2 is similarly defined for the edge weight connecting from the next\n    to the previous node), and the \"in-out\" region (w1 > 1 or w2 > 1).\n\n    This continuous extension version of node2vec+, i.e., node2vec++, aims to\n    provide continuity to those regions by parameterizing the bias-factor as\n    a continuous function of w1 and w2. The basic idea is to use w2 to control\n    the interpolation between 1 and 1 / q as before, but in addition, use w1\n    to parameterize the curvature of the interpolation, so as w1 approaches\n    zero, the bias-factor goes to min{1, 1 / q} (note that previously, the\n    bias-factor is set to min{1, 1 / q} whenever w1 falls below one).\n\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        Base.__init__(self, *args, **kwargs)\n\n    def get_move_forward(self):\n        \"\"\"Wrap ``move_forward``.\"\"\"\n        data = self.data\n        nonzero = self.nonzero\n        p = self.p\n        q = self.q\n\n        noise_thresholds = self.get_noise_thresholds()\n        get_normalized_probs = self.get_normalized_probs\n\n        @njit(nogil=True)\n        def move_forward(cur_idx, prev_idx=None):\n            \"\"\"Move to next node.\"\"\"\n            normalized_probs = get_normalized_probs(\n                data,\n                nonzero,\n                p,\n                q,\n                cur_idx,\n        
        prev_idx,\n                noise_thresholds,\n            )\n            cdf = np.cumsum(normalized_probs)\n            choice = np.searchsorted(cdf, np.random.random())\n            nbrs = np.where(nonzero[cur_idx])[0]\n\n            return nbrs[choice]\n\n        return move_forward\n\n    @staticmethod\n    @njit(nogil=True)\n    def get_normalized_probs(\n        data,\n        nonzero,\n        p,\n        q,\n        cur_idx,\n        prev_idx,\n        noise_threshold_ary,\n    ):\n        \"\"\"Calculate node2vec++ transition probabilities.\"\"\"\n        cur_nbrs_ind = nonzero[cur_idx]\n        cur_nbrs_weight = data[cur_idx].copy()\n\n        if prev_idx is not None:  # 2nd order biased walks\n            prev_nbrs_weight = data[prev_idx].copy()\n\n            # Note: we assume here the network is undirected, hence the edge\n            # weight connecting the next to prev is the same as the reverse.\n            out_ind = cur_nbrs_ind & (prev_nbrs_weight < noise_threshold_ary)\n            out_ind[prev_idx] = False  # exclude previous state from out biases\n\n            t = prev_nbrs_weight[out_ind] / noise_threshold_ary[out_ind]\n            # Determine whether to use '1 - t' or 't' depending on whether q\n            # is less than or greater than one so that alpha is suppressed to\n            # min{1, 1 / q} as w1 approaches 0.\n            t = 1 - t.clip(0, 1) if q < 1 else t.clip(0, 1)\n            b = cur_nbrs_weight[out_ind] / noise_threshold_ary[out_ind]\n\n            # compute out biases\n            scale = np.abs(1 - 1 / q)\n            offset = np.minimum(1, 1 / q)\n            alpha = t * b / (1 + (b - 1)) * scale + offset\n\n            cur_nbrs_weight[out_ind] *= alpha  # apply out biases\n            cur_nbrs_weight[prev_idx] /= p  # apply the return bias\n\n        unnormalized_probs = cur_nbrs_weight[cur_nbrs_ind]\n        normalized_probs = unnormalized_probs / unnormalized_probs.sum()\n\n        return normalized_probs\n"
  },
  {
    "path": "src/pecanpy/graph.py",
    "content": "\"\"\"Lite graph objects used by pecanpy.\"\"\"\nimport warnings\n\nimport numpy as np\n\nfrom .typing import AdjMat\nfrom .typing import AdjNonZeroMat\nfrom .typing import CSR\nfrom .typing import Dict\nfrom .typing import Float32Array\nfrom .typing import Iterator\nfrom .typing import List\nfrom .typing import Optional\nfrom .typing import Sequence\nfrom .typing import Tuple\nfrom .typing import Uint32Array\n\n\nclass BaseGraph:\n    \"\"\"Base Graph object.\n\n    Handles node id and provides general properties including num_nodes,\n    and density. The num_edges property is to be specified by the derived\n    graph objects.\n\n    \"\"\"\n\n    def __init__(self):\n        self._node_ids: List[str] = []\n        self._node_idmap: Dict[str, int] = {}  # id -> index\n\n    @property\n    def nodes(self) -> List[str]:\n        \"\"\"Return the list of node IDs.\"\"\"\n        return self._node_ids\n\n    @property\n    def num_nodes(self) -> int:\n        \"\"\"Return the number of nodes in the graph.\"\"\"\n        return len(self.nodes)\n\n    @property\n    def num_edges(self) -> int:\n        \"\"\"Return the number of edges in the graph.\"\"\"\n        raise NotImplementedError(\n            f\"{self.__class__.__name__} does not have num_edges, use the \"\n            f\"derived classes like SparseGraph and DenseGraph instead.\",\n        )\n\n    @property\n    def density(self) -> float:\n        \"\"\"Return the edge density of the graph.\"\"\"\n        return self.num_edges / self.num_nodes / (self.num_nodes - 1)\n\n    def set_node_ids(\n        self,\n        node_ids: Optional[Sequence[str]],\n        implicit_ids: bool = False,\n        num_nodes: Optional[int] = None,\n    ):\n        \"\"\"Update ID list and mapping.\n\n        Set _node_ids given the input node IDs and also set the corresponding\n        _node_idmap based on it, which maps from node ID to the index.\n\n        Args:\n            node_ids (:obj:`list` of :obj:`str`, 
optional): List of node IDs to\n                use. If not available, will implicitly set node IDs to the\n                canonical ordering of nodes with a warning message, which is\n                suppressed if `implicit_ids` is set to True.\n            implicit_ids (bool): Implicitly set the node IDs to the canonical\n                node ordering. If set to False and node IDs are not available,\n                it will also set implicit node IDs, but with a warning message.\n                The warning message can be suppressed if `implicit_ids` is set\n                to True as a confirmation of the behavior.\n            num_nodes (int, optional): Number of nodes, used when try to set\n                implicit node IDs.\n\n        \"\"\"\n        if (node_ids is not None) and (not implicit_ids):\n            self._node_ids = list(node_ids)\n        elif num_nodes is None:\n            raise ValueError(\n                \"Need to specify `num_nodes` when setting implicit node IDs.\",\n            )\n        else:\n            self.set_node_ids(list(map(str, range(num_nodes))))\n            if not implicit_ids:\n                warnings.warn(\n                    \"WARNING: Implicitly set node IDs to the canonical node \"\n                    \"ordering due to missing IDs field in the raw CSR npz \"\n                    \"file. 
This warning message can be suppressed by setting \"\n                    \"implicit_ids to True in the read_npz function call, or \"\n                    \"by setting the --implicit_ids flag in the CLI\",\n                    stacklevel=2,\n                )\n        self._node_idmap = {j: i for i, j in enumerate(self._node_ids)}\n\n    def get_has_nbrs(self):\n        \"\"\"Abstract method to be specified by derived classes.\"\"\"\n        raise NotImplementedError\n\n    def get_move_forward(self):\n        \"\"\"Abstract method to be specified by derived classes.\"\"\"\n        raise NotImplementedError\n\n\nclass AdjlstGraph(BaseGraph):\n    \"\"\"Adjacency list Graph object used for reading/writing edge list files.\n\n    Sparse Graph object that stores graph as adjacency list.\n\n    Note:\n        AdjlstGraph is only used for reading/writing edge list files and does\n        not support random walk computations since Numba njit does not work\n        with Python data structures like list and dict.\n\n    Examples:\n        Read ``.edg`` file and create ``AdjlstGraph`` object using\n        the ``.read`` method.\n\n        >>> from pecanpy.graph import AdjlstGraph\n        >>>\n        >>> # initialize AdjlstGraph object\n        >>> g = AdjlstGraph()\n        >>>\n        >>> # read graph from edgelist\n        >>> g.read(path_to_edg_file, weighted=True, directed=False)\n        >>>\n        >>> indptr, indices, data = g.to_csr()  # convert to csr\n        >>>\n        >>> dense_mat = g.to_dense()  # convert to dense adjacency matrix\n        >>>\n        >>> g.save(edg_outpath)  # save the graph to an edge list file\n\n    \"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self._data: List[Dict[int, float]] = []  # list of nbrs idx -> weights\n        self._num_edges: int = 0\n\n    @property\n    def edges_iter(self) -> Iterator[Tuple[int, int, float]]:\n        \"\"\"Return an iterator that iterates over all edges.\"\"\"\n        for 
head, head_nbrs in enumerate(self._data):\n            for tail in sorted(head_nbrs):\n                yield head, tail, head_nbrs[tail]\n\n    @property\n    def edges(self) -> List[Tuple[int, int, float]]:\n        \"\"\"Return a list of triples (head, tail, weight) representing edges.\"\"\"\n        return list(self.edges_iter)\n\n    @property\n    def num_edges(self):\n        \"\"\"Return the number of edges in the graph.\"\"\"\n        return self._num_edges\n\n    @staticmethod\n    def _read_edge_line(\n        edge_line: str,\n        weighted: bool,\n        delimiter: str,\n    ) -> Tuple[str, str, float]:\n        \"\"\"Read a line from the edge list file.\"\"\"\n        terms = edge_line.strip().split(delimiter)\n        id1, id2 = terms[0].strip(), terms[1].strip()\n\n        weight = 1.0\n        if weighted:\n            if len(terms) != 3:\n                raise ValueError(\n                    f\"Expecting three columns in the edge list file for a \"\n                    f\"weighted graph, got {len(terms)} instead: {edge_line!r}\",\n                )\n            weight = float(terms[-1])\n\n        return id1, id2, weight\n\n    @staticmethod\n    def _is_valid_edge_weight(id1: str, id2: str, weight: float) -> bool:\n        \"\"\"Check if the edge weight is positive.\"\"\"\n        if weight <= 0:\n            edg_str = f\"w({id1},{id2}) = {weight}\"\n            warnings.warn(\n                f\"Non-positive edge ignored: {edg_str}\",\n                RuntimeWarning,\n                stacklevel=2,\n            )\n            return False\n        return True\n\n    def _check_edge_existence(\n        self,\n        id1: str,\n        id2: str,\n        idx1: int,\n        idx2: int,\n        weight: float,\n    ):\n        \"\"\"Check if an edge exists.\n\n        If the edge to be added already exists and the new edge weight is\n        different from the existing edge weights, print warning message.\n\n        \"\"\"\n        if idx2 in 
self._data[idx1] and self._data[idx1][idx2] != weight:\n            warnings.warn(\n                f\"edge from {id1} to {id2} exists, with \"\n                f\"value of {self._data[idx1][idx2]:.2f}. \"\n                f\"Now overwrite to {weight:.2f}.\",\n                RuntimeWarning,\n                stacklevel=2,\n            )\n\n    def get_node_idx(self, node_id: str) -> int:\n        \"\"\"Get index of the node and create new node when necessary.\"\"\"\n        self.add_node(node_id)\n        return self._node_idmap[node_id]\n\n    def add_node(self, node_id: str):\n        \"\"\"Create a new node.\n\n        Add a new node to the graph if not already existing, by updating the\n        ID list, ID map, and the adjacency list data. Otherwise pass through\n        without further actions.\n\n        Note:\n            Does not raise error even if the node already exists.\n\n        \"\"\"\n        if node_id not in self._node_idmap:\n            self._node_idmap[node_id] = self.num_nodes\n            self.nodes.append(node_id)\n            self._data.append({})\n\n    def _add_edge_from_idx(self, idx1: int, idx2: int, weight: float):\n        \"\"\"Add an edge based on the head and tail node index with weight.\"\"\"\n        self._data[idx1][idx2] = weight\n        self._num_edges += 1\n\n    def add_edge(\n        self,\n        id1: str,\n        id2: str,\n        weight: float = 1.0,\n        directed: bool = False,\n    ):\n        \"\"\"Add an edge to the graph.\n\n        Note:\n            Non-positive edges are ignored.\n\n        Args:\n            id1 (str): first node id.\n            id2 (str): second node id.\n            weight (float): the edge weight, default is 1.0\n            directed (bool): whether the edge is directed or not.\n\n        \"\"\"\n        if self._is_valid_edge_weight(id1, id2, weight):\n            idx1, idx2 = map(self.get_node_idx, (id1, id2))\n            self._check_edge_existence(id1, id2, idx1, idx2, 
weight)\n\n            self._add_edge_from_idx(idx1, idx2, weight)\n            if not directed:\n                self._add_edge_from_idx(idx2, idx1, weight)\n\n    def read(\n        self,\n        path: str,\n        weighted: bool,\n        directed: bool,\n        delimiter: str = \"\\t\",\n    ):\n        \"\"\"Read an edgelist file and create sparse graph.\n\n        Note:\n            Implicitly discard zero weighted edges; if the same edge is defined\n            multiple times with different edge weights, then the last specified\n            weight will be used (warning for such behavior will be printed).\n\n        Args:\n            path (str): path to edgelist file, where the file is tab\n                separated and contains 2 or 3 columns depending on whether\n                the input graph is weighted, where the first column\n                contains the source nodes and the second column contains the\n                destination nodes that interact with the corresponding source\n                nodes.\n            weighted (bool): whether the graph is weighted. If unweighted,\n                only two columns are expected in the edgelist file, and the\n                edge weights are implicitly set to 1 for all interactions. 
If\n                weighted, a third column encoding the weight of the interaction\n                in numeric value is expected.\n            directed (bool): whether the graph is directed, if undirected, the\n                edge connecting from destination node to source node is created\n                with same edge weight from source node to destination node.\n            delimiter (str): delimiter of the edge list file, default is tab.\n\n        \"\"\"\n        with open(path, encoding=\"utf-8\") as f:\n            for edge_line in f:\n                edge = self._read_edge_line(edge_line, weighted, delimiter)\n                self.add_edge(*edge, directed)\n\n    def save(self, path: str, unweighted: bool = False, delimiter: str = \"\\t\"):\n        \"\"\"Save AdjLst as an ``.edg`` edge list file.\n\n        Args:\n            unweighted (bool): If set to True, only write two columns,\n                corresponding to the head and tail nodes of the edges, and\n                ignore the edge weights (default: :obj:`False`).\n            delimiter (str): Delimiter for separating fields.\n\n        \"\"\"\n        with open(path, \"w\", encoding=\"utf-8\") as f:\n            for h, t, w in self.edges_iter:\n                h_id, t_id = self.nodes[h], self.nodes[t]\n                terms = (h_id, t_id) if unweighted else (h_id, t_id, str(w))\n                f.write(f\"{delimiter.join(terms)}\\n\")\n\n    def to_csr(self) -> CSR:\n        \"\"\"Construct compressed sparse row matrix.\"\"\"\n        indptr: Uint32Array = np.zeros(len(self.nodes) + 1, dtype=np.uint32)\n        for i, row_data in enumerate(self._data):\n            indptr[i + 1] = indptr[i] + len(row_data)\n\n        # last element of indptr indicates the total number of nonzero entries\n        indices = np.zeros(indptr[-1], dtype=np.uint32)\n        data = np.zeros(indptr[-1], dtype=np.float32)\n\n        for i, nbrs in enumerate(self._data):\n            if len(nbrs) == 0:\n                
continue\n            new_indices, new_data = zip(*[(j, nbrs[j]) for j in sorted(nbrs)])\n            chunk = slice(indptr[i], indptr[i + 1])\n            indices[chunk] = np.array(new_indices, dtype=np.uint32)\n            data[chunk] = np.array(new_data, dtype=np.float32)\n\n        return indptr, indices, data\n\n    def to_dense(self) -> AdjMat:\n        \"\"\"Construct dense adjacency matrix.\n\n        Note:\n            This method does not return a DenseGraph object, but instead returns\n            a dense adjacency matrix as NDArray, where the index is the same\n            as that of ``nodes``.\n\n        Return:\n            NDArray: Full adjacency matrix as 2d numpy array.\n\n        \"\"\"\n        n_nodes = len(self.nodes)\n        mat = np.zeros((n_nodes, n_nodes))\n\n        for src_node, src_nbrs in enumerate(self._data):\n            for dst_node in src_nbrs:\n                mat[src_node, dst_node] = src_nbrs[dst_node]\n\n        return mat\n\n    @classmethod\n    def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):\n        \"\"\"Construct graph using adjacency matrix and node IDs.\n\n        Args:\n            adj_mat(NDArray): 2D numpy array of adjacency matrix\n            node_ids(:obj:`list` of str): node ID list\n\n        Return:\n            An adjacency graph object representing the adjacency matrix.\n\n        \"\"\"\n        g = cls(**kwargs)\n\n        # Setup node idmap in the order of node_ids\n        for node_id in node_ids:\n            g.add_node(node_id)\n\n        # Fill in edge data\n        for idx1, idx2 in zip(*np.where(adj_mat != 0)):\n            g._add_edge_from_idx(idx1, idx2, adj_mat[idx1, idx2])\n\n        return g\n\n\nclass SparseGraph(BaseGraph):\n    \"\"\"Sparse Graph object that stores graph as adjacency list.\n\n    Examples:\n        Read ``.edg`` file and create ``SparseGraph`` object using\n        ``.read_edg`` method.\n\n        >>> from pecanpy.graph import SparseGraph\n        >>>\n     
   >>> # initialize SparseGraph object\n        >>> g = SparseGraph()\n        >>>\n        >>> # read graph from edgelist\n        >>> g.read_edg(path_to_edg_file, weighted=True, directed=False)\n        >>>\n        >>> # save the csr graph as npz file to be used later\n        >>> g.save(npz_outpath)\n\n    \"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self.data: Optional[Float32Array] = None\n        self.indptr: Optional[Uint32Array] = None\n        self.indices: Optional[Uint32Array] = None\n\n    @property\n    def num_edges(self) -> int:\n        \"\"\"Return the number of edges in the graph.\"\"\"\n        if self.indptr is not None:\n            return self.indptr[-1]\n        else:\n            raise ValueError(\"Empty graph.\")\n\n    def read_edg(\n        self,\n        path: str,\n        weighted: bool,\n        directed: bool,\n        delimiter: str = \"\\t\",\n    ):\n        \"\"\"Create CSR sparse graph from edge list.\n\n        First create ``AdjlstGraph`` by reading the edge list file, and then\n        convert to ``SparseGraph`` via ``to_csr``.\n\n        Args:\n            path (str): path to edgelist file.\n            weighted (bool): whether the graph is weighted.\n            directed (bool): whether the graph is directed.\n            delimiter (str): delimiter used between node IDs.\n\n        \"\"\"\n        g = AdjlstGraph()\n        g.read(path, weighted, directed, delimiter)\n        self.set_node_ids(g.nodes)\n        self.indptr, self.indices, self.data = g.to_csr()\n\n    def read_npz(self, path: str, weighted: bool, implicit_ids: bool = False):\n        \"\"\"Directly read a CSR sparse graph.\n\n        Note:\n            To generate a CSR file compatible with PecanPy, first load the graph\n                as a sparse graph using the SparseGraph (with ``csr=True``).\n                Then save the sparse graph to a csr file using the ``save``\n                method from ``SparseGraph``. 
The saved ``.npz`` file can then\n                be loaded directly by ``SparseGraph`` later.\n\n        Args:\n            path (str): path to the csr file, which is an npz file with four\n                arrays with keys 'IDs', 'data', 'indptr', 'indices', which\n                correspond to the node IDs, the edge weights, the offset array\n                for each node, and the indices of the edges.\n            weighted (bool): whether the graph is weighted, if unweighted,\n                all edge weights will be converted to 1.\n            directed (bool): not used, for compatibility with ``SparseGraph``.\n            implicit_ids (bool): Implicitly set the node IDs to the canonical\n                node ordering from the CSR graph. If unset and the `IDs` field\n                is not found in the input CSR graph, a warning message will be\n                displayed on screen. The missing `IDs` field can happen, for\n                example, when the user uses the CSR graph prepared by\n                `scipy.sparse.csr`.\n\n        \"\"\"\n        raw = np.load(path)\n        self.indptr = raw[\"indptr\"].astype(np.uint32)\n        self.indices = raw[\"indices\"].astype(np.uint32)\n        self.data = raw[\"data\"].astype(np.float32)\n        if self.data is None:\n            raise ValueError(\"Adjacency matrix data not found.\")\n        elif not weighted:\n            self.data[:] = 1.0  # overwrite edge weights with constant\n\n        self.set_node_ids(\n            raw.get(\"IDs\"),\n            implicit_ids=implicit_ids,\n            num_nodes=int(self.indptr.size - 1),\n        )\n\n    def save(self, path: str):\n        \"\"\"Save CSR as ``.csr.npz`` file.\"\"\"\n        np.savez(\n            path,\n            IDs=self.nodes,\n            data=self.data,\n            indptr=self.indptr,\n            indices=self.indices,\n        )\n\n    @classmethod\n    def from_adjlst_graph(cls, adjlst_graph, **kwargs):\n        \"\"\"Construct csr graph 
from adjacency list graph.\n\n        Args:\n            adjlst_graph (:obj:`pecanpy.graph.AdjlstGraph`): Adjacency list\n                graph to be converted.\n\n        \"\"\"\n        g = cls(**kwargs)\n        g.set_node_ids(adjlst_graph.nodes)\n        g.indptr, g.indices, g.data = adjlst_graph.to_csr()\n        return g\n\n    @classmethod\n    def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):\n        \"\"\"Construct csr graph using adjacency matrix and node IDs.\n\n        Note:\n            Only consider positive valued edges.\n\n        Args:\n            adj_mat(NDArray): 2D numpy array of adjacency matrix\n            node_ids(:obj:`list` of str): node ID list\n\n        \"\"\"\n        g = cls(**kwargs)\n        g.set_node_ids(node_ids)\n        adjlst_graph = AdjlstGraph.from_mat(adj_mat, node_ids)\n        g.indptr, g.indices, g.data = adjlst_graph.to_csr()\n        return g\n\n\nclass DenseGraph(BaseGraph):\n    \"\"\"Dense Graph object that stores graph as array.\n\n    Examples:\n        Read ``.npz`` files and create ``DenseGraph`` object using ``read_npz``\n\n        >>> from pecanpy.graph import DenseGraph\n        >>>\n        >>> g = DenseGraph() # initialize DenseGraph object\n        >>>\n        >>> g.read_npz(path_to_npz_file, weighted=True)\n\n        Read ``.edg`` files and create ``DenseGraph`` object using ``read_edg``\n\n        >>> from pecanpy.graph import DenseGraph\n        >>>\n        >>> # initialize DenseGraph object\n        >>> g = DenseGraph()\n        >>>\n        >>> # read graph from edgelist\n        >>> g.read_edg(path_to_edg_file, weighted=True, directed=False)\n        >>>\n        >>> # save the dense graph as npz file to be used later\n        >>> g.save(npz_outpath)\n\n    \"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self._data: Optional[AdjMat] = None\n        self._nonzero: Optional[AdjNonZeroMat] = None\n\n    @property\n    def num_edges(self) -> 
int:\n        \"\"\"Return the number of edges in the graph.\"\"\"\n        if self.nonzero is not None:\n            return self.nonzero.sum()\n        else:\n            raise ValueError(\"Empty graph.\")\n\n    @property\n    def data(self) -> Optional[AdjMat]:\n        \"\"\"Return the adjacency matrix.\"\"\"\n        return self._data\n\n    @data.setter\n    def data(self, data: AdjMat):\n        \"\"\"Set adjacency matrix and the corresponding nonzero matrix.\"\"\"\n        self._data = data.astype(float)\n        self._nonzero = np.array(self._data != 0, dtype=bool)\n\n    @property\n    def nonzero(self) -> Optional[AdjNonZeroMat]:\n        \"\"\"Return the nonzero mask for the adjacency matrix.\"\"\"\n        return self._nonzero\n\n    def read_npz(self, path: str, weighted: bool, implicit_ids: bool = False):\n        \"\"\"Read ``.npz`` file and create dense graph.\n\n        Args:\n            path (str): path to ``.npz`` file.\n            weighted (bool): whether the graph is weighted, if unweighted,\n                all nonzero weights will be converted to 1.\n            implicit_ids (bool): Implicitly set the node IDs to the canonical\n                ordering from the dense adjacency matrix object. If unset and\n                the `IDs` field is not found in the object, a warning message\n                will be displayed on screen. 
This warning message can be\n                suppressed if `implicit_ids` is set to True as a confirmation\n                of the behavior.\n\n        \"\"\"\n        raw = np.load(path)\n        self.data = raw[\"data\"]\n        if not weighted:  # overwrite edge weights with constant\n            self.data = self.nonzero * 1.0  # type: ignore\n\n        self.set_node_ids(\n            raw.get(\"IDs\"),\n            implicit_ids=implicit_ids,\n            num_nodes=self.data.shape[0],\n        )\n\n    def read_edg(\n        self,\n        path: str,\n        weighted: bool,\n        directed: bool,\n        delimiter: str = \"\\t\",\n    ):\n        \"\"\"Read an edgelist file and construct dense graph.\"\"\"\n        g = AdjlstGraph()\n        g.read(path, weighted, directed, delimiter)\n\n        self.set_node_ids(g.nodes)\n        self.data = g.to_dense()\n\n    def save(self, path: str):\n        \"\"\"Save dense graph  as ``.dense.npz`` file.\"\"\"\n        np.savez(path, data=self.data, IDs=self.nodes)\n\n    @classmethod\n    def from_adjlst_graph(cls, adjlst_graph, **kwargs):\n        \"\"\"Construct dense graph from adjacency list graph.\n\n        Args:\n            adjlst_graph (:obj:`pecanpy.graph.AdjlstGraph`): Adjacency list\n                graph to be converted.\n\n        \"\"\"\n        g = cls(**kwargs)\n        g.set_node_ids(adjlst_graph.nodes)\n        g.data = adjlst_graph.to_dense()\n        return g\n\n    @classmethod\n    def from_mat(cls, adj_mat: AdjMat, node_ids: List[str], **kwargs):\n        \"\"\"Construct dense graph using adjacency matrix and node IDs.\n\n        Args:\n            adj_mat(NDArray): 2D numpy array of adjacency matrix\n            node_ids(:obj:`list` of str): node ID list\n\n        \"\"\"\n        g = cls(**kwargs)\n        g.data = adj_mat\n        g.set_node_ids(node_ids)\n        return g\n"
  },
  {
    "path": "src/pecanpy/pecanpy.py",
    "content": "\"\"\"Different strategies for generating node2vec walks.\"\"\"\nimport numpy as np\nfrom gensim.models import Word2Vec\nfrom numba import njit\nfrom numba import prange\nfrom numba_progress import ProgressBar\n\nfrom .graph import BaseGraph\nfrom .rw import DenseRWGraph\nfrom .rw import SparseRWGraph\nfrom .typing import Embeddings\nfrom .typing import Float32Array\nfrom .typing import HasNbrs\nfrom .typing import List\nfrom .typing import MoveForward\nfrom .typing import Optional\nfrom .typing import Uint32Array\nfrom .typing import Uint64Array\nfrom .wrappers import Timer\n\ntry:\n    from numba.np.ufunc.parallel import get_thread_id\nexcept ImportError:  # numba<0.56\n    from numba.np.ufunc.parallel import _get_thread_id as get_thread_id\n\n\nclass Base(BaseGraph):\n    \"\"\"Base node2vec object.\n\n    This base object provides the skeleton for the node2vec walk algorithm,\n    which consists of the ``simulate_walks`` method that generate node2vec\n    random walks. 
In contrast to the original Python implementation of\n    node2vec, it is parallelized where each process generates walks\n    independently.\n\n    Args:\n        p (float): return parameter, value less than 1 encourages returning\n            back to previous vertex, and value greater than 1 discourages it\n            (default: 1).\n        q (float): in-out parameter, value less than 1 encourages walks to\n            go \"outward\", and value greater than 1 encourages walking within\n            a localized neighborhood (default: 1)\n        workers (int): number of threads to be spawned for running node2vec\n            including walk generation and word2vec embedding (default: 1)\n        verbose (bool): show progress bar for walk generation.\n        extend (bool): use node2vec+ extension if set to :obj:`True`\n            (default: :obj:`False`).\n        gamma (float): Multiplication factor for the std term of edge\n            weights added to the average edge weights as the noisy edge\n            threshold, only used by node2vec+ (default: 0)\n        random_state (int, optional): Random seed for generating random\n            walks. 
Note that to fully ensure reproducibility, use single\n            thread (i.e., workers=1), and potentially need to set the\n            Python environment variable ``PYTHONHASHSEED`` to match the\n            random_state (default: :obj:`None`).\n\n    Note:\n        The ``preprocess_transition_probs`` is required for implementations that\n        precomputes and stores 2nd order transition probabilities.\n\n    Examples:\n        Generate node2vec embeddings\n\n        >>> from pecanpy import pecanpy as node2vec\n        >>>\n        >>> # initialize node2vec object, similarly for SparseOTF and DenseOTF\n        >>> g = node2vec.PreComp(p=0.5, q=1, workers=4, verbose=True)\n        >>> # alternatively, can specify ``extend=True`` for using node2vec+\n        >>>\n        >>> # load graph from edgelist file\n        >>> g.read_edg(path_to_edg_file, weighted=True, directed=False)\n        >>> # precompute and save 2nd order transition probs (for PreComp only)\n        >>> g.preprocess_transition_probs()\n        >>>\n        >>> # generate random walks, which could then be used to train w2v\n        >>> walks = g.simulate_walks(num_walks=10, walk_length=80)\n        >>>\n        >>> # alternatively, generate the embeddings directly using ``embed``\n        >>> emd = g.embed()\n\n    \"\"\"\n\n    def __init__(\n        self,\n        p: float = 1,\n        q: float = 1,\n        workers: int = 1,\n        verbose: bool = False,\n        extend: bool = False,\n        gamma: float = 0,\n        random_state: Optional[int] = None,\n    ):\n        super().__init__()\n        self.p = p\n        self.q = q\n        self.workers = workers  # TODO: not doing anything, need to fix.\n        self.verbose = verbose\n        self.extend = extend\n        self.gamma = gamma\n        self.random_state = random_state\n        self._preprocessed: bool = False\n\n    def _map_walk(self, walk_idx_ary: Uint32Array) -> List[str]:\n        \"\"\"Map walk from node index to node 
ID.\n\n        Note:\n            The last element in the ``walk_idx_ary`` encodes the effective walk\n            length. Only walk indices up to the effective walk length are\n            translated (mapped to node IDs).\n\n        \"\"\"\n        end_idx = walk_idx_ary[-1]\n        walk = [self.nodes[i] for i in walk_idx_ary[:end_idx]]\n        return walk\n\n    def simulate_walks(\n        self,\n        num_walks: int,\n        walk_length: int,\n    ) -> List[List[str]]:\n        \"\"\"Generate walks starting from each nodes ``num_walks`` time.\n\n        Note:\n            This is the master process that spawns worker processes, where the\n            worker function ``node2vec_walks`` genearte a single random walk\n            starting from a vertex of the graph.\n\n        Args:\n            num_walks (int): number of walks starting from each node.\n            walks_length (int): length of walk.\n\n        \"\"\"\n        self._preprocess_transition_probs()\n\n        nodes = np.array(range(self.num_nodes), dtype=np.uint32)\n        start_node_idx_ary = np.concatenate([nodes] * num_walks)\n        tot_num_jobs = start_node_idx_ary.size\n\n        random_state = self.random_state\n        np.random.seed(random_state)\n        np.random.shuffle(start_node_idx_ary)  # for balanced work load\n\n        move_forward = self.get_move_forward()\n        has_nbrs = self.get_has_nbrs()\n        verbose = self.verbose\n\n        # Acquire numba progress proxy for displaying the progress bar\n        with ProgressBar(total=tot_num_jobs, disable=not verbose) as progress:\n            walk_idx_mat = self._random_walks(\n                tot_num_jobs,\n                walk_length,\n                random_state,\n                start_node_idx_ary,\n                has_nbrs,\n                move_forward,\n                progress,\n            )\n\n        # Map node index back to node ID\n        walks = [self._map_walk(walk_idx_ary) for walk_idx_ary in 
walk_idx_mat]\n\n        return walks\n\n    @staticmethod\n    @njit(parallel=True, nogil=True)\n    def _random_walks(\n        tot_num_jobs: int,\n        walk_length: int,\n        random_state: Optional[int],\n        start_node_idx_ary: Uint32Array,\n        has_nbrs: HasNbrs,\n        move_forward: MoveForward,\n        progress_proxy: ProgressBar,\n    ) -> Uint32Array:\n        \"\"\"Simulate a random walk starting from start node.\"\"\"\n        # Seed the random number generator\n        if random_state is not None:\n            np.random.seed(random_state + get_thread_id())\n\n        # use the last entry of each walk index array to keep track of the\n        # effective walk length\n        walk_idx_mat: Uint32Array = np.zeros(\n            (tot_num_jobs, walk_length + 2),\n            dtype=np.uint32,\n        )\n        walk_idx_mat[:, 0] = start_node_idx_ary  # initialize seeds\n        walk_idx_mat[:, -1] = walk_length + 1  # set to full walk length by default\n\n        for i in prange(tot_num_jobs):\n            # initialize first step as normal random walk\n            start_node_idx = walk_idx_mat[i, 0]\n            if has_nbrs(start_node_idx):\n                walk_idx_mat[i, 1] = move_forward(start_node_idx)\n            else:\n                walk_idx_mat[i, -1] = 1\n                continue\n\n            # start bias random walk\n            for j in range(2, walk_length + 1):\n                cur_idx = walk_idx_mat[i, j - 1]\n                if has_nbrs(cur_idx):\n                    prev_idx = walk_idx_mat[i, j - 2]\n                    walk_idx_mat[i, j] = move_forward(cur_idx, prev_idx)\n                else:\n                    walk_idx_mat[i, -1] = j\n                    break\n\n            progress_proxy.update(1)\n\n        return walk_idx_mat\n\n    def setup_get_normalized_probs(self):\n        \"\"\"Transition probability computation setup.\n\n        This function performs necessary preprocessing of computing the\n        
average edge weights array, which is used later by the transition\n        probability computation function ``get_extended_normalized_probs``,\n        if node2vec+ is used. Otherwise, returns the normal transition function\n        ``get_normalized_probs`` with a trivial placeholder for average edge\n        weights array ``noise_thresholds``.\n\n        \"\"\"\n        if self.extend:  # use n2v+\n            get_normalized_probs = self.get_extended_normalized_probs\n            noise_thresholds = self.get_noise_thresholds()\n        else:  # use normal n2v\n            get_normalized_probs = self.get_normalized_probs\n            noise_thresholds = None\n        return get_normalized_probs, noise_thresholds\n\n    def preprocess_transition_probs(self):\n        \"\"\"Null default preprocess method.\"\"\"\n        pass\n\n    def _preprocess_transition_probs(self):\n        if not self._preprocessed:\n            self.preprocess_transition_probs()\n            self._preprocessed = True\n\n    def embed(\n        self,\n        dim: int = 128,\n        num_walks: int = 10,\n        walk_length: int = 80,\n        window_size: int = 10,\n        epochs: int = 1,\n        verbose: bool = False,\n    ) -> Embeddings:\n        \"\"\"Generate embeddings.\n\n        This is a shortcut function that combines ``simulate_walks`` with\n        ``Word2Vec`` to generate the node2vec embedding.\n\n        Note:\n            The resulting embeddings are aligned with the graph, i.e., the\n            index of embeddings is the same as that for the graph.\n\n        Args:\n            dim (int): dimension of the final embedding, default is 128\n            num_walks (int): number of random walks generated using each node\n                as the seed node, default is 10\n            walk_length (int): length of the random walks, default is 80\n            window_size (int): context window size for training the\n                ``Word2Vec`` model, default is 10\n            epochs 
(int): number of epochs for training ``Word2Vec``, default\n                is 1\n            verbose (bool): print time usage for random walk generation and\n                skip-gram training if set to True\n\n        Return:\n            Embeddings: The embedding matrix, each row is a node embedding\n                vector. The index is the same as that for the graph.\n\n        \"\"\"\n        timed_walk = Timer(\"generate walks\", verbose)(self.simulate_walks)\n        timed_w2v = Timer(\"train embeddings\", verbose)(Word2Vec)\n\n        walks = timed_walk(num_walks, walk_length)\n        w2v = timed_w2v(\n            walks,\n            vector_size=dim,\n            window=window_size,\n            sg=1,\n            min_count=0,\n            workers=self.workers,\n            epochs=epochs,\n            seed=self.random_state,\n        )\n\n        return w2v.wv[self.nodes]\n\n\nclass FirstOrderUnweighted(Base, SparseRWGraph):\n    \"\"\"Directly sample edges for first order random walks.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        Base.__init__(self, *args, **kwargs)\n\n    def get_move_forward(self):\n        \"\"\"Wrap ``move_forward``.\"\"\"\n        indices = self.indices\n        indptr = self.indptr\n\n        @njit(nogil=True)\n        def move_forward(cur_idx, prev_idx=None):\n            start, end = indptr[cur_idx], indptr[cur_idx + 1]\n            return indices[np.random.randint(start, end)]\n\n        return move_forward\n\n\nclass PreCompFirstOrder(Base, SparseRWGraph):\n    \"\"\"Precompute transition probabilities for first order random walks.\"\"\"\n\n    def __init__(self, *args, **kwargs):\n        Base.__init__(self, *args, **kwargs)\n        self.alias_j = self.alias_q = None\n\n    def get_move_forward(self):\n        \"\"\"Wrap ``move_forward``.\"\"\"\n        indices = self.indices\n        indptr = self.indptr\n\n        alias_j = self.alias_j\n        alias_q = self.alias_q\n\n        @njit(nogil=True)\n        def 
move_forward(cur_idx, prev_idx=None):\n            start, end = indptr[cur_idx], indptr[cur_idx + 1]\n            choice = alias_draw(alias_j[start:end], alias_q[start:end])\n\n            return indices[indptr[cur_idx] + choice]\n\n        return move_forward\n\n    def preprocess_transition_probs(self):\n        \"\"\"Precompute and store first order transition probabilities.\"\"\"\n        data = self.data\n        indices = self.indices\n        indptr = self.indptr\n\n        # Retrieve transition probability computation callback function\n        get_normalized_probs = self.get_normalized_probs_first_order\n\n        # Determine the dimensionality of the 1st order transition probs\n        n_nodes = indptr.size - 1  # number of nodes\n        n_probs = indptr[-1]  # total number of 1st order transition probs\n\n        @njit(parallel=True, nogil=True)\n        def compute_all_transition_probs():\n            alias_j = np.zeros(n_probs, dtype=np.uint32)\n            alias_q = np.zeros(n_probs, dtype=np.float32)\n\n            for idx in range(n_nodes):\n                start, end = indptr[idx], indptr[idx + 1]\n                probs = get_normalized_probs(data, indices, indptr, idx)\n                alias_j[start:end], alias_q[start:end] = alias_setup(probs)\n\n            return alias_j, alias_q\n\n        self.alias_j, self.alias_q = compute_all_transition_probs()\n\n\nclass PreComp(Base, SparseRWGraph):\n    \"\"\"Precompute transition probabilities.\n\n    This implementation precomputes and stores 2nd order transition\n    probabilities first and uses read off transition probabilities during the\n    process of random walk. 
The graph type used is ``SparseRWGraph``.\n\n    Note:\n        Need to call ``preprocess_transition_probs()`` first before generating\n        walks.\n\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        Base.__init__(self, *args, **kwargs)\n        self.alias_dim: Optional[Uint32Array] = None\n        self.alias_j: Optional[Uint32Array] = None\n        self.alias_q: Optional[Float32Array] = None\n        self.alias_indptr: Optional[Uint64Array] = None\n\n    def get_move_forward(self):\n        \"\"\"Wrap ``move_forward``.\n\n        This function returns a ``numba.njit`` compiled function that takes\n        current vertex index (and the previous vertex index if available) and\n        returns the next vertex index by sampling from a discrete random\n        distribution based on the transition probabilities that are read off\n        the precomputed transition probabilities table.\n\n        Note:\n            The returned function is used by the ``simulate_walks`` method.\n\n        \"\"\"\n        data = self.data\n        indices = self.indices\n        indptr = self.indptr\n        p = self.p\n        q = self.q\n        get_normalized_probs = self.get_normalized_probs\n\n        alias_j = self.alias_j\n        alias_q = self.alias_q\n        alias_indptr = self.alias_indptr\n        alias_dim = self.alias_dim\n\n        @njit(nogil=True)\n        def move_forward(cur_idx, prev_idx=None):\n            \"\"\"Move to next node based on transition probabilities.\"\"\"\n            if prev_idx is None:\n                normalized_probs = get_normalized_probs(\n                    data,\n                    indices,\n                    indptr,\n                    p,\n                    q,\n                    cur_idx,\n                    None,\n                    None,\n                )\n                cdf = np.cumsum(normalized_probs)\n                choice = np.searchsorted(cdf, np.random.random())\n            else:\n                # 
Find index of neighbor (previous node) for reading alias\n                start = indptr[cur_idx]\n                end = indptr[cur_idx + 1]\n                nbr_idx = np.searchsorted(indices[start:end], prev_idx)\n                if indices[start + nbr_idx] != prev_idx:\n                    print(\"FATAL ERROR! Neighbor not found.\")\n\n                dim = alias_dim[cur_idx]\n                start = alias_indptr[cur_idx] + dim * nbr_idx\n                end = start + dim\n                choice = alias_draw(alias_j[start:end], alias_q[start:end])\n\n            return indices[indptr[cur_idx] + choice]\n\n        return move_forward\n\n    def preprocess_transition_probs(self):\n        \"\"\"Precompute and store 2nd order transition probabilities.\n\n        Each node contains n ** 2 number of 2nd order transition probabilities,\n        where n is the number of neighbors of that specific node, since one\n        can pick any one of its neighbors as the previous node and / or the\n        next node. 
For each second order transition probability of a node, set\n        up the alias draw table to be used during random walk.\n\n        Note:\n            Uses uint64 instead of uint32 for tracking alias_indptr to prevent\n            overflowing since the 2nd order transition probs grows much faster\n            than the first order transition probs, which is the same as the\n            total number of edges in the graph.\n\n        \"\"\"\n        data = self.data\n        indices = self.indices\n        indptr = self.indptr\n        p = self.p\n        q = self.q\n\n        # Retrieve transition probability computation callback function\n        get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()\n\n        # Determine the dimensionality of the 2nd order transition probs\n        n_nodes = self.indptr.size - 1  # number of nodes\n        n = self.indptr[1:] - self.indptr[:-1]  # number of nbrs per node\n        n2 = np.power(n, 2)  # number of 2nd order trans probs per node\n\n        # Set the dimensionality of alias probability table\n        self.alias_dim = alias_dim = n\n        self.alias_indptr = alias_indptr = np.zeros(self.indptr.size, dtype=np.uint64)\n        alias_indptr[1:] = np.cumsum(n2)\n        n_probs = alias_indptr[-1]  # total number of 2nd order transition probs\n\n        @njit(parallel=True, nogil=True)\n        def compute_all_transition_probs():\n            alias_j = np.zeros(n_probs, dtype=np.uint32)\n            alias_q = np.zeros(n_probs, dtype=np.float32)\n\n            for idx in range(n_nodes):\n                offset = alias_indptr[idx]\n                dim = alias_dim[idx]\n\n                nbrs = indices[indptr[idx] : indptr[idx + 1]]\n                for nbr_idx in prange(n[idx]):\n                    nbr = nbrs[nbr_idx]\n                    probs = get_normalized_probs(\n                        data,\n                        indices,\n                        indptr,\n                        p,\n        
                q,\n                        idx,\n                        nbr,\n                        noise_thresholds,\n                    )\n\n                    start = offset + dim * nbr_idx\n                    end = start + dim\n                    alias_j[start:end], alias_q[start:end] = alias_setup(probs)\n\n            return alias_j, alias_q\n\n        self.alias_j, self.alias_q = compute_all_transition_probs()\n\n\nclass SparseOTF(Base, SparseRWGraph):\n    \"\"\"Sparse graph transition on the fly.\n\n    This implementation does *NOT* precompute transition probabilities in advance\n    but instead calculates them on-the-fly during the process of random walk.\n    The graph type used is ``SparseRWGraph``.\n\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        Base.__init__(self, *args, **kwargs)\n\n    def get_move_forward(self):\n        \"\"\"Wrap ``move_forward``.\n\n        This function returns a ``numba.njit`` compiled function that takes\n        current vertex index (and the previous vertex index if available) and\n        returns the next vertex index by sampling from a discrete random\n        distribution based on the transition probabilities that are calculated\n        on-the-fly.\n\n        Note:\n            The returned function is used by the ``simulate_walks`` method.\n\n        \"\"\"\n        data = self.data\n        indices = self.indices\n        indptr = self.indptr\n        p = self.p\n        q = self.q\n\n        get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()\n\n        @njit(nogil=True)\n        def move_forward(cur_idx, prev_idx=None):\n            \"\"\"Move to next node.\"\"\"\n            normalized_probs = get_normalized_probs(\n                data,\n                indices,\n                indptr,\n                p,\n                q,\n                cur_idx,\n                prev_idx,\n                noise_thresholds,\n            )\n            cdf = 
np.cumsum(normalized_probs)\n            choice = np.searchsorted(cdf, np.random.random())\n\n            return indices[indptr[cur_idx] + choice]\n\n        return move_forward\n\n\nclass DenseOTF(Base, DenseRWGraph):\n    \"\"\"Dense graph transition on the fly.\n\n    This implementation does *NOT* precompute transition probabilities in advance\n    but instead calculates them on-the-fly during the process of random walk.\n    The graph type used is ``DenseRWGraph``.\n\n    \"\"\"\n\n    def __init__(self, *args, **kwargs):\n        Base.__init__(self, *args, **kwargs)\n\n    def get_move_forward(self):\n        \"\"\"Wrap ``move_forward``.\n\n        This function returns a ``numba.njit`` compiled function that takes\n        current vertex index (and the previous vertex index if available) and\n        returns the next vertex index by sampling from a discrete random\n        distribution based on the transition probabilities that are calculated\n        on-the-fly.\n\n        Note:\n            The returned function is used by the ``simulate_walks`` method.\n\n        \"\"\"\n        data = self.data\n        nonzero = self.nonzero\n        p = self.p\n        q = self.q\n\n        get_normalized_probs, noise_thresholds = self.setup_get_normalized_probs()\n\n        @njit(nogil=True)\n        def move_forward(cur_idx, prev_idx=None):\n            \"\"\"Move to next node.\"\"\"\n            normalized_probs = get_normalized_probs(\n                data,\n                nonzero,\n                p,\n                q,\n                cur_idx,\n                prev_idx,\n                noise_thresholds,\n            )\n            cdf = np.cumsum(normalized_probs)\n            choice = np.searchsorted(cdf, np.random.random())\n            nbrs = np.where(nonzero[cur_idx])[0]\n\n            return nbrs[choice]\n\n        return move_forward\n\n\n@njit(nogil=True)\ndef alias_setup(probs):\n    \"\"\"Construct alias lookup table.\n\n    This code is modified from 
the blog post here:\n    https://lips.cs.princeton.edu/the-alias-method-efficient-sampling-with-many-discrete-outcomes/\n    , where you can find more details about how the method works. In general,\n    the alias method improves the time complexity of sampling from a discrete\n    random distribution to O(1) if the alias table is setup in advance.\n\n    Args:\n        probs (list(float32)): normalized transition probabilities array, could\n            be in either list or NDArray, of float32 values.\n\n    \"\"\"\n    k = probs.size\n    q = np.zeros(k, dtype=np.float32)\n    j = np.zeros(k, dtype=np.uint32)\n\n    smaller = np.zeros(k, dtype=np.uint32)\n    larger = np.zeros(k, dtype=np.uint32)\n    smaller_ptr = 0\n    larger_ptr = 0\n\n    for kk in range(k):\n        q[kk] = k * probs[kk]\n        if q[kk] < 1.0:\n            smaller[smaller_ptr] = kk\n            smaller_ptr += 1\n        else:\n            larger[larger_ptr] = kk\n            larger_ptr += 1\n\n    while (smaller_ptr > 0) & (larger_ptr > 0):\n        smaller_ptr -= 1\n        small = smaller[smaller_ptr]\n        larger_ptr -= 1\n        large = larger[larger_ptr]\n\n        j[small] = large\n        q[large] = q[large] + q[small] - 1.0\n        if q[large] < 1.0:\n            smaller[smaller_ptr] = large\n            smaller_ptr += 1\n        else:\n            larger[larger_ptr] = large\n            larger_ptr += 1\n\n    return j, q\n\n\n@njit(nogil=True)\ndef alias_draw(j, q):\n    \"\"\"Draw sample from a non-uniform discrete distribution using alias sampling.\"\"\"\n    k = j.size\n\n    kk = np.random.randint(k)\n    if np.random.rand() < q[kk]:\n        return kk\n    else:\n        return j[kk]\n"
  },
  {
    "path": "src/pecanpy/rw/__init__.py",
    "content": "\"\"\"Graph objects equipped with random walk transition functions.\"\"\"\nfrom .dense_rw import DenseRWGraph\nfrom .sparse_rw import SparseRWGraph\n\n__all__ = [\"DenseRWGraph\", \"SparseRWGraph\"]\n"
  },
  {
    "path": "src/pecanpy/rw/dense_rw.py",
    "content": "\"\"\"Dense Graph object equipped with random walk computation.\"\"\"\nimport numpy as np\nfrom numba import njit\n\nfrom ..graph import DenseGraph\n\n\nclass DenseRWGraph(DenseGraph):\n    \"\"\"Dense Graph object equipped with random walk computation.\"\"\"\n\n    def get_noise_thresholds(self):\n        \"\"\"Compute average edge weights.\"\"\"\n        noise_threshold_ary = np.zeros(self.num_nodes, dtype=np.float32)\n        for i in range(self.num_nodes):\n            weights = self.data[i, self.nonzero[i]]\n            noise_threshold_ary[i] = weights.mean() + self.gamma * weights.std()\n        noise_threshold_ary = np.maximum(noise_threshold_ary, 0)\n\n        return noise_threshold_ary\n\n    def get_has_nbrs(self):\n        \"\"\"Wrap ``has_nbrs``.\"\"\"\n        nonzero = self.nonzero\n\n        @njit(nogil=True)\n        def has_nbrs(idx):\n            for j in range(nonzero.shape[1]):\n                if nonzero[idx, j]:\n                    return True\n            return False\n\n        return has_nbrs\n\n    @staticmethod\n    @njit(nogil=True)\n    def get_normalized_probs(\n        data,\n        nonzero,\n        p,\n        q,\n        cur_idx,\n        prev_idx,\n        noise_threshold_ary,\n    ):\n        \"\"\"Calculate node2vec transition probabilities.\n\n        Calculate 2nd order transition probabilities by first finding the\n        neighbors of the current state that are not reachable from the previous\n        state, and divide the corresponding edge weights by the in-out parameter\n        ``q``. Then divide the edge weight from previous state by the return\n        parameter ``p``. 
Finally, the transition probabilities are computed by\n        normalizing the biased edge weights.\n\n        Note:\n            If ``prev_idx`` present, calculate 2nd order biased transition,\n        otherwise calculate 1st order transition.\n\n        \"\"\"\n        nbrs_ind = nonzero[cur_idx]\n        unnormalized_probs = data[cur_idx].copy()\n\n        if prev_idx is not None:  # 2nd order biased walks\n            non_com_nbr = np.logical_and(nbrs_ind, ~nonzero[prev_idx])\n            non_com_nbr[prev_idx] = False  # exclude previous state from out biases\n\n            unnormalized_probs[non_com_nbr] /= q  # apply out biases\n            unnormalized_probs[prev_idx] /= p  # apply the return bias\n\n        unnormalized_probs = unnormalized_probs[nbrs_ind]\n        normalized_probs = unnormalized_probs / unnormalized_probs.sum()\n\n        return normalized_probs\n\n    @staticmethod\n    @njit(nogil=True)\n    def get_extended_normalized_probs(\n        data,\n        nonzero,\n        p,\n        q,\n        cur_idx,\n        prev_idx,\n        noise_threshold_ary,\n    ):\n        \"\"\"Calculate node2vec+ transition probabilities.\"\"\"\n        cur_nbrs_ind = nonzero[cur_idx]\n        unnormalized_probs = data[cur_idx].copy()\n\n        if prev_idx is not None:  # 2nd order biased walks\n            prev_nbrs_weight = data[prev_idx].copy()\n\n            # Note: we assume here the network is undirected, hence the edge\n            # weight connecting the next to prev is the same as the reverse.\n            out_ind = cur_nbrs_ind & (prev_nbrs_weight < noise_threshold_ary)\n            out_ind[prev_idx] = False  # exclude previous state from out biases\n\n            # print(\"CURRENT: \", cur_idx)\n            # print(\"INOUT: \", np.where(out_ind)[0])\n            # print(\"NUM INOUT: \", out_ind.sum(), \"\\n\")\n\n            t = prev_nbrs_weight[out_ind] / noise_threshold_ary[out_ind]\n            # optional nonlinear parameterization\n            # 
b = 1; t = b * t / (1 - (b - 1) * t)\n\n            # compute out biases\n            alpha = 1 / q + (1 - 1 / q) * t\n\n            # suppress noisy edges\n            alpha[\n                unnormalized_probs[out_ind] < noise_threshold_ary[cur_idx]\n            ] = np.minimum(1, 1 / q)\n            unnormalized_probs[out_ind] *= alpha  # apply out biases\n            unnormalized_probs[prev_idx] /= p  # apply the return bias\n\n        unnormalized_probs = unnormalized_probs[cur_nbrs_ind]\n        normalized_probs = unnormalized_probs / unnormalized_probs.sum()\n\n        return normalized_probs\n"
  },
  {
    "path": "src/pecanpy/rw/sparse_rw.py",
    "content": "\"\"\"Sparse Graph equipped with random walk computation.\"\"\"\nimport numpy as np\nfrom numba import boolean\nfrom numba import njit\n\nfrom ..graph import SparseGraph\n\n\nclass SparseRWGraph(SparseGraph):\n    \"\"\"Sparse Graph equipped with random walk computation.\"\"\"\n\n    def get_has_nbrs(self):\n        \"\"\"Wrap ``has_nbrs``.\"\"\"\n        indptr = self.indptr\n\n        @njit(nogil=True)\n        def has_nbrs(idx):\n            return indptr[idx] != indptr[idx + 1]\n\n        return has_nbrs\n\n    def get_noise_thresholds(self):\n        \"\"\"Compute average edge weights.\"\"\"\n        data = self.data\n        indptr = self.indptr\n\n        noise_threshold_ary = np.zeros(self.num_nodes, dtype=np.float32)\n        for i in range(self.num_nodes):\n            noise_threshold_ary[i] = (\n                data[indptr[i] : indptr[i + 1]].mean()\n                + self.gamma * data[indptr[i] : indptr[i + 1]].std()\n            )\n        noise_threshold_ary = np.maximum(noise_threshold_ary, 0)\n\n        return noise_threshold_ary\n\n    @staticmethod\n    @njit(nogil=True)\n    def get_normalized_probs_first_order(data, indices, indptr, cur_idx):\n        \"\"\"Calculate first order transition probabilities.\n\n        Note:\n            This function does NOT check whether p = q = 1, which is the\n            required setup for first order random walk. 
Need to check before\n            calling this function.\n\n        \"\"\"\n        _, unnormalized_probs = get_nbrs(indptr, indices, data, cur_idx)\n        return unnormalized_probs / unnormalized_probs.sum()\n\n    @staticmethod\n    @njit(nogil=True)\n    def get_normalized_probs(\n        data,\n        indices,\n        indptr,\n        p,\n        q,\n        cur_idx,\n        prev_idx,\n        noise_threshold_ary,\n    ):\n        \"\"\"Calculate node2vec transition probabilities.\n\n        Calculate 2nd order transition probabilities by first finding the\n        neighbors of the current state that are not reachable from the previous\n        state, and divide the corresponding edge weights by the in-out parameter\n        ``q``. Then divide the edge weight from previous state by the return\n        parameter ``p``. Finally, the transition probabilities are computed by\n        normalizing the biased edge weights.\n\n        Note:\n            If ``prev_idx`` present, calculate 2nd order biased transition,\n        otherwise calculate 1st order transition.\n\n        \"\"\"\n        nbrs_idx, unnormalized_probs = get_nbrs(indptr, indices, data, cur_idx)\n        if prev_idx is not None:  # 2nd order biased walk\n            prev_ptr = np.where(nbrs_idx == prev_idx)[0]\n            src_nbrs_idx, src_nbrs_wts = get_nbrs(indptr, indices, data, prev_idx)\n\n            # Neighbors of current but not previous\n            non_com_nbr = isnotin(nbrs_idx, src_nbrs_idx)\n            non_com_nbr[prev_ptr] = False  # exclude prev state from out biases\n\n            unnormalized_probs[non_com_nbr] /= q  # apply out biases\n            unnormalized_probs[prev_ptr] /= p  # apply the return bias\n\n        normalized_probs = unnormalized_probs / unnormalized_probs.sum()\n\n        return normalized_probs\n\n    @staticmethod\n    @njit(nogil=True)\n    def get_extended_normalized_probs(\n        data,\n        indices,\n        indptr,\n        p,\n        q,\n       
 cur_idx,\n        prev_idx,\n        noise_threshold_ary,\n    ):\n        \"\"\"Calculate node2vec+ transition probabilities.\"\"\"\n        nbrs_idx, unnormalized_probs = get_nbrs(indptr, indices, data, cur_idx)\n        if prev_idx is not None:  # 2nd order biased walk\n            prev_ptr = np.where(nbrs_idx == prev_idx)[0]\n            src_nbrs_idx, src_nbrs_wts = get_nbrs(indptr, indices, data, prev_idx)\n            out_ind, t = isnotin_extended(\n                nbrs_idx,\n                src_nbrs_idx,\n                src_nbrs_wts,\n                noise_threshold_ary,\n            )  # determine out edges\n            out_ind[prev_ptr] = False  # exclude prev state from out biases\n\n            # compute out biases\n            alpha = 1 / q + (1 - 1 / q) * t[out_ind]\n\n            # suppress noisy edges\n            alpha[\n                unnormalized_probs[out_ind] < noise_threshold_ary[cur_idx]\n            ] = np.minimum(1, 1 / q)\n            unnormalized_probs[out_ind] *= alpha  # apply out biases\n            unnormalized_probs[prev_ptr] /= p  # apply the return bias\n\n        normalized_probs = unnormalized_probs / unnormalized_probs.sum()\n\n        return normalized_probs\n\n\n@njit(nogil=True)\ndef get_nbrs(indptr, indices, data, idx):\n    \"\"\"Return neighbor indices and weights of a specific node index.\"\"\"\n    start_idx, end_idx = indptr[idx], indptr[idx + 1]\n    nbrs_idx = indices[start_idx:end_idx]\n    nbrs_wts = data[start_idx:end_idx].copy()\n    return nbrs_idx, nbrs_wts\n\n\n@njit(nogil=True)\ndef isnotin(ptr_ary1, ptr_ary2):\n    \"\"\"Find node2vec out edges.\n\n    The node2vec out edges are determined by non-common neighbors. 
This function\n    finds out neighbors of node1 that are not neighbors of node2, by picking out\n    values in ``ptr_ary1`` but not in ``ptr_ary2``, which correspond to the\n    neighbor pointers for the current state and the previous state, resp.\n\n    Note:\n        This function does not remove the index of the previous state. Instead,\n    the index of the previous state will be removed once the indicator is\n    returned to the ``get_normalized_probs``.\n\n    Args:\n        ptr_ary1 (Uint32Array): array of pointers to\n            the neighbors of the current state\n        ptr_ary2 (Uint32Array): array of pointers to\n            the neighbors of the previous state\n\n    Returns:\n        Indicator of whether a neighbor of the current state is considered as\n            an \"out edge\"\n\n    Example:\n        The values in the two neighbor pointer arrays are sorted ascendingly.\n        The main idea is to scan through ``ptr_ary1`` and compare the values in\n        ``ptr_ary2``. In this way, at most one pass per array is needed to find\n        out the non-common neighbor pointers instead of a nested loop (for each\n        element in ``ptr_ary1``, compare against every element in ``ptr_ary2``),\n        which is much slower. 
Check out the following example for more intuition.\n        The ``*`` above ``ptr_ary1`` and ``ptr_ary2`` indicate the indices\n        ``idx1`` and ``idx2``, respectively, which keep track of the scanning\n        progress.\n\n        >>> ptr_ary1 = [1, 2, 5]\n        >>> ptr_ary2 = [1, 5]\n        >>>\n        >>> # iteration1: indicator = [False, True, True]\n        >>>  *\n        >>> [1, 2, 5]\n        >>>  *\n        >>> [1, 5]\n        >>>\n        >>> # iteration2: indicator = [False, True, True]\n        >>>     *\n        >>> [1, 2, 5]\n        >>>     *\n        >>> [1, 5]\n        >>>\n        >>> # iteration3: indicator = [False, True, False]\n        >>>        *\n        >>> [1, 2, 5]\n        >>>     *\n        >>> [1, 5]\n        >>>\n        >>> # end of loop\n\n    \"\"\"\n    indicator = np.ones(ptr_ary1.size, dtype=boolean)\n    idx2 = 0\n    for idx1 in range(ptr_ary1.size):\n        if idx2 == ptr_ary2.size:  # end of ary2\n            break\n\n        ptr1 = ptr_ary1[idx1]\n        ptr2 = ptr_ary2[idx2]\n\n        if ptr1 < ptr2:\n            continue\n\n        elif ptr1 == ptr2:  # found a matching value\n            indicator[idx1] = False\n            idx2 += 1\n\n        elif ptr1 > ptr2:\n            # sweep through ptr_ary2 until ptr2 catch up on ptr1\n            for j in range(idx2, ptr_ary2.size):\n                ptr2 = ptr_ary2[j]\n                if ptr2 == ptr1:\n                    indicator[idx1] = False\n                    idx2 = j + 1\n                    break\n\n                elif ptr2 > ptr1:\n                    idx2 = j\n                    break\n\n    return indicator\n\n\n@njit(nogil=True)\ndef isnotin_extended(ptr_ary1, ptr_ary2, wts_ary2, noise_thresholds):\n    \"\"\"Find node2vec+ out edges.\n\n    The node2vec+ out edges are determined by considering the edge weights\n    connecting node2 (the potential next state) to the previous state. 
Unlike\n    node2vec, which only considers neighbors of current state that are not\n    neighbors of the previous state, node2vec+ also considers neighbors of\n    the previous state as out edges if the edge weight is below average.\n\n    Args:\n        ptr_ary1 (Uint32Array): array of pointers to the neighbors of the\n            current state\n        ptr_ary2 (Uint32Array): array of pointers to the neighbors of the\n            previous state\n        wts_ary2 (Float32Array): array of edge weights of the previous state\n        noise_thresholds (Float32Array): array of noisy edge threshold computed\n            based on the average and the std of the edge weights of each node\n\n    Return:\n        Indicator of whether a neighbor of the current state is considered as\n            an \"out edge\", with the corresponding parameters used to fine tune\n            the out biases\n\n    \"\"\"\n    indicator = np.ones(ptr_ary1.size, dtype=boolean)\n    t = np.zeros(ptr_ary1.size, dtype=np.float32)\n    idx2 = 0\n    for idx1 in range(ptr_ary1.size):\n        if idx2 >= ptr_ary2.size:  # end of ary2\n            break\n\n        ptr1 = ptr_ary1[idx1]\n        ptr2 = ptr_ary2[idx2]\n\n        if ptr1 < ptr2:\n            continue\n\n        elif ptr1 == ptr2:  # found a matching value\n            # If connection is not loose, identify as an in-edge\n            if wts_ary2[idx2] >= noise_thresholds[ptr2]:\n                indicator[idx1] = False\n            else:\n                t[idx1] = wts_ary2[idx2] / noise_thresholds[ptr2]\n            idx2 += 1\n\n        elif ptr1 > ptr2:\n            # Sweep through ptr_ary2 until ptr2 catch up on ptr1\n            for j in range(idx2 + 1, ptr_ary2.size):\n                ptr2 = ptr_ary2[j]\n                if ptr2 == ptr1:\n                    if wts_ary2[j] >= noise_thresholds[ptr2]:\n                        indicator[idx1] = False\n                    else:\n                        t[idx1] = wts_ary2[j] / 
noise_thresholds[ptr2]\n                    idx2 = j + 1\n                    break\n\n                elif ptr2 > ptr1:\n                    idx2 = j\n                    break\n\n    return indicator, t\n"
  },
  {
    "path": "src/pecanpy/typing.py",
    "content": "\"\"\"Type annotations.\"\"\"\nfrom typing import Any\nfrom typing import Callable\nfrom typing import Dict\nfrom typing import Iterator\nfrom typing import List\nfrom typing import Optional\nfrom typing import Sequence\nfrom typing import Tuple\n\nfrom nptyping import Bool\nfrom nptyping import Float32\nfrom nptyping import NDArray\nfrom nptyping import Shape\nfrom nptyping import UInt32\nfrom nptyping import UInt64\nfrom typing_extensions import TypeAlias\n\n# Callbacks ###################################################################\nHasNbrs = Callable[[UInt32], bool]\nMoveForward = Callable[..., UInt32]\n\n# Numpy array types ###########################################################\n# issue with type alias (https://stackoverflow.com/questions/62073473)\nEmbeddings: TypeAlias = NDArray[Shape[\"*, *\"], Float32]\nAdjMat: TypeAlias = NDArray[Shape[\"*, *\"], Any]\nAdjNonZeroMat: TypeAlias = NDArray[Shape[\"*, *\"], Bool]\nUint32Array: TypeAlias = NDArray[Shape[\"*\"], UInt32]\nUint64Array: TypeAlias = NDArray[Shape[\"*\"], UInt64]\nFloat32Array: TypeAlias = NDArray[Shape[\"*\"], Float32]\nCSR = Tuple[Uint32Array, Uint32Array, Float32Array]\n\n__all__ = [\n    \"AdjMat\",\n    \"AdjNonZeroMat\",\n    \"Any\",\n    \"CSR\",\n    \"Callable\",\n    \"Dict\",\n    \"Embeddings\",\n    \"Float32Array\",\n    \"HasNbrs\",\n    \"Iterator\",\n    \"List\",\n    \"MoveForward\",\n    \"NDArray\",\n    \"Optional\",\n    \"Sequence\",\n    \"Tuple\",\n    \"Uint32Array\",\n]\n"
  },
  {
    "path": "src/pecanpy/wrappers.py",
    "content": "\"\"\"Wrappers used by pecanpy.\"\"\"\nimport time\n\n\nclass Timer:\n    \"\"\"Timer for logging runtime of function.\"\"\"\n\n    def __init__(self, name, verbose=True):\n        self.name = name\n        self.verbose = verbose\n\n    def __call__(self, func):\n        \"\"\"Call timer decorator.\"\"\"\n\n        def wrapper(*args, **kwargs):\n            start = time.time()\n            result = func(*args, **kwargs)\n            duration = time.time() - start\n\n            hrs = int(duration // 3600)\n            mins = int(duration % 3600 // 60)\n            secs = duration % 60\n            print(f\"Took {hrs:02d}:{mins:02d}:{secs:05.2f} to {self.name}\")\n\n            return result\n\n        return wrapper if self.verbose else func\n"
  },
  {
    "path": "test/test_cli.py",
    "content": "import os\nimport os.path as osp\nimport shutil\nimport subprocess\nimport tempfile\nimport unittest\nfrom unittest.mock import patch\n\nfrom numba import set_num_threads\nfrom parameterized import parameterized\nfrom pecanpy import cli\n\nset_num_threads(1)\n\nDATA_DIR = osp.abspath(osp.join(__file__, osp.pardir, osp.pardir, \"demo\"))\nEDG_FP = osp.join(DATA_DIR, \"karate.edg\")\n\nTMP_DATA_DIR = tempfile.mkdtemp()\nCSR_FP = osp.join(TMP_DATA_DIR, \"karate.csr.npz\")\nDENSE_FP = osp.join(TMP_DATA_DIR, \"karate.dense.npz\")\nCOM = [\"pecanpy\", \"--input\", EDG_FP, \"--output\"]\n\nSETTINGS = [\n    (\"FirstOrderUnweighted\",),\n    (\"PreCompFirstOrder\",),\n    (\"PreComp\",),\n    (\"SparseOTF\",),\n    (\"DenseOTF\",),\n]\n\n\nclass TestCli(unittest.TestCase):\n    @classmethod\n    def setUpClass(cls):\n        subprocess.run(COM + [CSR_FP, \"--task\", \"tocsr\"])\n        subprocess.run(COM + [DENSE_FP, \"--task\", \"todense\"])\n\n    @classmethod\n    def tearDownClass(cls):\n        shutil.rmtree(TMP_DATA_DIR)\n\n    @patch(\n        \"argparse._sys.argv\",\n        [\"pecanpy\", \"--input\", \"\", \"--output\", os.devnull],\n    )\n    def setUp(self):\n        self.args = cli.parse_args()\n        self.args.workers = 1\n        self.args.dimensions = 8\n        self.args.walk_length = 10\n        self.args.num_walks = 2\n        self.g = self.walks = None\n\n    def tearDown(self):\n        del self.args\n        del self.g\n        del self.walks\n\n    def execute(self, mode, input_file, p=1, q=1):\n        self.args.mode = mode\n        self.args.input = input_file\n        self.args.p = p\n        self.args.q = q\n        self.g = cli.read_graph(self.args)\n        cli.preprocess(self.g)\n        self.walks = cli.simulate_walks(self.args, self.g)\n        cli.learn_embeddings(self.args, self.walks)\n\n    def test_firstorderunweighted_catch(self):\n        for p, q in (2, 1), (1, 0.1), (0.1, 0.1):\n            with self.subTest(p=p, 
q=q):\n                with self.assertRaises(ValueError):\n                    self.execute(\"FirstOrderUnweighted\", EDG_FP, p, q)\n\n    def test_precompfirstorder_catch(self):\n        for p, q in (2, 1), (1, 0.1), (0.1, 0.1):\n            with self.subTest(p=p, q=q):\n                with self.assertRaises(ValueError):\n                    self.execute(\"PreCompFirstOrder\", EDG_FP, p, q)\n\n    @parameterized.expand(SETTINGS)\n    def test_from_edg(self, name):\n        self.execute(name, EDG_FP)\n\n    @parameterized.expand(SETTINGS)\n    def test_from_npz(self, name):\n        self.execute(name, DENSE_FP if name == \"DenseOTF\" else CSR_FP)\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "test/test_graph.py",
    "content": "import os\nimport os.path as osp\nimport shutil\nimport tempfile\nimport unittest\nfrom itertools import chain\n\nimport numpy as np\nimport pytest\nimport scipy.sparse\nfrom pecanpy.graph import AdjlstGraph\nfrom pecanpy.graph import BaseGraph\nfrom pecanpy.graph import DenseGraph\nfrom pecanpy.graph import SparseGraph\n\nMAT = np.array(\n    [\n        [0, 1, 1],\n        [1, 0, 0],\n        [1, 0, 0],\n    ],\n    dtype=float,\n)\nINDPTR = np.array([0, 2, 3, 4], dtype=np.uint32)\nINDICES = np.array([1, 2, 0, 0], dtype=np.uint32)\nDATA = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32)\nADJLST = [\n    {1: 1.0, 2: 1.0},\n    {0: 1.0},\n    {0: 1.0},\n]\nIDS = [\"a\", \"b\", \"c\"]\nIDMAP = {\"a\": 0, \"b\": 1, \"c\": 2}\n\n# This test ensures that the node IDs (from edges) are loaded in the correct order\n# even if they appear to have been loaded in an incorrect order.\nMAT2 = np.array(\n    [\n        [0, 1, 0, 0, 0],\n        [1, 0, 1, 1, 0],\n        [0, 1, 0, 0, 0],\n        [0, 1, 0, 0, 1],\n        [0, 0, 0, 1, 0],\n    ],\n    dtype=float,\n)\nINDPTR2 = np.array([0, 1, 4, 5, 7, 8], dtype=np.uint32)\nINDICES2 = np.array([1, 0, 2, 3, 1, 1, 4, 3], dtype=np.uint32)\nDATA2 = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)\nADJLST2 = [\n    {1: 1.0},\n    {0: 1.0, 2: 1.0, 3: 1.0},\n    {1: 1.0},\n    {1: 1.0, 4: 1.0},\n    {3: 1.0},\n]\nIDS2 = [\"a\", \"b\", \"c\", \"d\", \"e\"]\nIDMAP2 = {\"a\": 0, \"b\": 1, \"c\": 2, \"d\": 3, \"e\": 4}\n\n# Test asymmetric directed graph loading with node that has no out-going edge\nMAT3 = np.array(\n    [\n        [0, 1, 0, 0],\n        [1, 0, 0, 1],\n        [0, 0, 0, 0],\n        [0, 1, 1, 0],\n    ],\n)\nINDPTR3 = np.array([0, 1, 3, 3, 5], dtype=np.uint32)\nINDICES3 = np.array([1, 0, 3, 1, 2], dtype=np.uint32)\nDATA3 = np.array([1.0, 1.0, 1.0, 1.0, 1.0], dtype=np.float32)\nADJLST3 = [\n    {1: 1.0},\n    {0: 1.0, 3: 1.0},\n    {},\n    {1: 1.0, 2: 1.0},\n]\nIDS3 = [\"a\", \"b\", \"c\", 
\"d\"]\nIDMAP3 = {\"a\": 0, \"b\": 1, \"c\": 2, \"d\": 3}\n\n\nclass TestBaseGraph(unittest.TestCase):\n    def setUp(self):\n        self.g = BaseGraph()\n        self.g.set_node_ids(IDS)\n\n    def test_set_node_ids(self):\n        self.assertEqual(self.g.nodes, IDS)\n        self.assertEqual(self.g._node_idmap, IDMAP)\n\n    def test_properties(self):\n        self.assertEqual(self.g.num_nodes, 3)\n        with self.assertRaises(NotImplementedError):\n            self.assertEqual(self.g.num_edges, 4)\n        with self.assertRaises(NotImplementedError):\n            self.assertEqual(self.g.density, 2 / 3)\n\n\nclass TestAdjlstGraph(unittest.TestCase):\n    def setUp(self):\n        self.g1 = AdjlstGraph.from_mat(MAT, IDS)\n        self.g2 = AdjlstGraph.from_mat(MAT2, IDS2)\n        self.g3 = AdjlstGraph.from_mat(MAT3, IDS3)\n\n    def tearDown(self):\n        del self.g1\n        del self.g2\n        del self.g3\n\n    def test_from_mat(self):\n        self.assertEqual(self.g1._data, ADJLST)\n        self.assertEqual(self.g1.nodes, IDS)\n\n        self.assertEqual(self.g2._data, ADJLST2)\n        self.assertEqual(self.g2.nodes, IDS2)\n\n        self.assertEqual(self.g3._data, ADJLST3)\n        self.assertEqual(self.g3.nodes, IDS3)\n\n    def test_properties(self):\n        self.assertEqual(self.g1.num_nodes, 3)\n        self.assertEqual(self.g1.num_edges, 4)\n        self.assertEqual(self.g1.density, 2 / 3)\n\n        self.assertEqual(self.g2.num_nodes, 5)\n        self.assertEqual(self.g2.num_edges, 8)\n        self.assertEqual(self.g2.density, 2 / 5)\n\n        self.assertEqual(self.g3.num_nodes, 4)\n        self.assertEqual(self.g3.num_edges, 5)\n        self.assertEqual(self.g3.density, 5 / 12)\n\n    def test_edges(self):\n        self.assertEqual(\n            list(self.g1.edges),\n            [\n                (0, 1, 1),\n                (0, 2, 1),\n                (1, 0, 1),\n                (2, 0, 1),\n            ],\n        )\n\n        
self.assertEqual(\n            list(self.g2.edges),\n            [\n                (0, 1, 1),\n                (1, 0, 1),\n                (1, 2, 1),\n                (1, 3, 1),\n                (2, 1, 1),\n                (3, 1, 1),\n                (3, 4, 1),\n                (4, 3, 1),\n            ],\n        )\n\n    def test_save(self):\n        expected_results = {\n            (False, \"\\t\"): [\n                \"a\\tb\\t1.0\\n\",\n                \"a\\tc\\t1.0\\n\",\n                \"b\\ta\\t1.0\\n\",\n                \"c\\ta\\t1.0\\n\",\n            ],\n            (True, \"\\t\"): [\n                \"a\\tb\\n\",\n                \"a\\tc\\n\",\n                \"b\\ta\\n\",\n                \"c\\ta\\n\",\n            ],\n            (False, \",\"): [\n                \"a,b,1.0\\n\",\n                \"a,c,1.0\\n\",\n                \"b,a,1.0\\n\",\n                \"c,a,1.0\\n\",\n            ],\n            (True, \",\"): [\n                \"a,b\\n\",\n                \"a,c\\n\",\n                \"b,a\\n\",\n                \"c,a\\n\",\n            ],\n        }\n\n        tmpdir = tempfile.mkdtemp()\n        tmppath = os.path.join(tmpdir, \"test.edg\")\n\n        for unweighted in True, False:\n            for delimiter in [\"\\t\", \",\"]:\n                self.g1.save(\n                    tmppath,\n                    unweighted=unweighted,\n                    delimiter=delimiter,\n                )\n\n                with open(tmppath) as f:\n                    expected_result = expected_results[(unweighted, delimiter)]\n                    for line, expected_line in zip(f, expected_result):\n                        self.assertEqual(line, expected_line)\n\n        shutil.rmtree(tmpdir)\n\n\nclass TestSparseGraph(unittest.TestCase):\n    def tearDown(self):\n        del self.g1\n        del self.g2\n        del self.g3\n\n    def validate(self):\n        self.assertTrue(np.all(self.g1.indptr == INDPTR))\n        
self.assertTrue(np.all(self.g1.indices == INDICES))\n        self.assertTrue(np.all(self.g1.data == DATA))\n        self.assertEqual(self.g1.nodes, IDS)\n        self.assertEqual(self.g1.num_nodes, 3)\n        self.assertEqual(self.g1.num_edges, 4)\n        self.assertEqual(self.g1.density, 2 / 3)\n\n        self.assertTrue(np.all(self.g2.indptr == INDPTR2))\n        self.assertTrue(np.all(self.g2.indices == INDICES2))\n        self.assertTrue(np.all(self.g2.data == DATA2))\n        self.assertEqual(self.g2.nodes, IDS2)\n        self.assertEqual(self.g2.num_nodes, 5)\n        self.assertEqual(self.g2.num_edges, 8)\n        self.assertEqual(self.g2.density, 2 / 5)\n\n        self.assertTrue(np.all(self.g3.indptr == INDPTR3))\n        self.assertTrue(np.all(self.g3.indices == INDICES3))\n        self.assertTrue(np.all(self.g3.data == DATA3))\n        self.assertEqual(self.g3.nodes, IDS3)\n        self.assertEqual(self.g3.num_nodes, 4)\n        self.assertEqual(self.g3.num_edges, 5)\n        self.assertEqual(self.g3.density, 5 / 12)\n\n    def test_from_mat(self):\n        self.g1 = SparseGraph.from_mat(MAT, IDS)\n        self.g2 = SparseGraph.from_mat(MAT2, IDS2)\n        self.g3 = SparseGraph.from_mat(MAT3, IDS3)\n        self.validate()\n\n    def test_from_adjlst_graph(self):\n        self.g1 = SparseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT, IDS))\n        self.g2 = SparseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT2, IDS2))\n        self.g3 = SparseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT3, IDS3))\n        self.validate()\n\n\nclass TestDenseGraph(unittest.TestCase):\n    def tearDown(self):\n        del self.g1\n        del self.g2\n\n    def validate(self):\n        self.assertTrue(np.all(self.g1.data == MAT))\n        self.assertEqual(self.g1.nodes, IDS)\n        self.assertEqual(self.g1.num_nodes, 3)\n        self.assertEqual(self.g1.num_edges, 4)\n        self.assertEqual(self.g1.density, 2 / 3)\n\n        
self.assertTrue(np.all(self.g2.data == MAT2))\n        self.assertEqual(self.g2.nodes, IDS2)\n        self.assertEqual(self.g2.num_nodes, 5)\n        self.assertEqual(self.g2.num_edges, 8)\n        self.assertEqual(self.g2.density, 2 / 5)\n\n        self.assertTrue(np.all(self.g3.data == MAT3))\n        self.assertEqual(self.g3.nodes, IDS3)\n        self.assertEqual(self.g3.num_nodes, 4)\n        self.assertEqual(self.g3.num_edges, 5)\n        self.assertEqual(self.g3.density, 5 / 12)\n\n    def test_from_mat(self):\n        self.g1 = DenseGraph.from_mat(MAT, IDS)\n        self.g2 = DenseGraph.from_mat(MAT2, IDS2)\n        self.g3 = DenseGraph.from_mat(MAT3, IDS3)\n        self.validate()\n\n    def test_from_adjlst_graph(self):\n        self.g1 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT, IDS))\n        self.g2 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT2, IDS2))\n        self.g3 = DenseGraph.from_adjlst_graph(AdjlstGraph.from_mat(MAT3, IDS3))\n        self.validate()\n\n\n@pytest.mark.usefixtures(\"karate_graph_converted\")\ndef test_csr_from_scipy(tmpdir):\n    tmp_karate_csr_path = osp.join(tmpdir, \"karate.csr.npz\")\n    print(f\"Temporary karate CSR will be saved under {tmp_karate_csr_path}\")\n\n    # Save karate CSR using scipy.sparse.csr\n    edgelist = np.loadtxt(pytest.KARATE_ORIG_PATH).astype(int) - 1\n    edgelist = np.vstack((edgelist, edgelist[:, [1, 0]])).T  # to undirected\n    num_nodes = edgelist.max() + 1\n    csr = scipy.sparse.csr_matrix(\n        (np.ones(edgelist.shape[1]), ([edgelist[0], edgelist[1]])),\n        shape=(num_nodes, num_nodes),\n    )\n    scipy.sparse.save_npz(tmp_karate_csr_path, csr)\n\n    # Load scipy CSR and compare with PecanPy CSR\n    scipy_csr_graph, pecanpy_graph = SparseGraph(), AdjlstGraph()\n    scipy_csr_graph.read_npz(tmp_karate_csr_path, weighted=False)\n    pecanpy_graph.read(pytest.KARATE_ORIG_PATH, weighted=False, directed=False)\n\n    # Assert graph size (number of nodes)\n    
assert scipy_csr_graph.num_nodes == pecanpy_graph.num_nodes\n\n    # Assert neighborhood sizes\n    scipy_csr_nbhd_sizes = scipy_csr_graph.indptr[1:] - scipy_csr_graph.indptr[:-1]\n    for scipy_node_idx in range(scipy_csr_graph.num_nodes):\n        pecanpy_node_idx = pecanpy_graph.get_node_idx(str(scipy_node_idx + 1))\n        assert scipy_csr_nbhd_sizes[scipy_node_idx] == len(\n            pecanpy_graph._data[pecanpy_node_idx],\n        )\n\n\n@pytest.mark.usefixtures(\"karate_graph_converted\")\n@pytest.mark.parametrize(\"implicit_ids\", [True, False])\n@pytest.mark.parametrize(\"graph_factory\", [SparseGraph, DenseGraph])\ndef test_implicit_ids(implicit_ids, graph_factory):\n    graph_path = (\n        pytest.KARATE_CSR_PATH\n        if graph_factory == SparseGraph\n        else pytest.KARATE_DENSE_PATH\n    )\n    ref_ids = pytest.KARATE_IMPLICIT_IDS if implicit_ids else pytest.KARATE_NODE_IDS\n\n    g = graph_factory()\n    g.read_npz(graph_path, weighted=False, implicit_ids=implicit_ids)\n\n    assert sorted(g.nodes) == sorted(ref_ids)\n\n\n@pytest.fixture(scope=\"module\")\ndef karate_graph_converted(pytestconfig, tmpdir_factory):\n    tmpdir = tmpdir_factory.mktemp(\"test_graph\")\n    pytest.KARATE_ORIG_PATH = osp.join(pytestconfig.rootpath, \"demo/karate.edg\")\n    pytest.KARATE_CSR_PATH = osp.join(tmpdir, \"karate.csr.npz\")\n    pytest.KARATE_DENSE_PATH = osp.join(tmpdir, \"karate.dense.npz\")\n\n    # Load karate node ids\n    karate_edgelist = np.loadtxt(pytest.KARATE_ORIG_PATH, dtype=str).tolist()\n    pytest.KARATE_NODE_IDS = list(set(chain.from_iterable(karate_edgelist)))\n    pytest.KARATE_IMPLICIT_IDS = list(map(str, range(len(pytest.KARATE_NODE_IDS))))\n\n    # Load karate graph and save csr.npz and dense.npz\n    g = AdjlstGraph()\n    g.read(pytest.KARATE_ORIG_PATH, weighted=False, directed=False)\n    SparseGraph.from_adjlst_graph(g).save(pytest.KARATE_CSR_PATH)\n    DenseGraph.from_adjlst_graph(g).save(pytest.KARATE_DENSE_PATH)\n    del 
g\n\n    yield\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "test/test_pecanpy.py",
    "content": "import os.path as osp\nimport unittest\n\nfrom numba import set_num_threads\nfrom parameterized import parameterized\nfrom pecanpy import graph\nfrom pecanpy import pecanpy\n\nset_num_threads(1)\n\nDATA_DIR = osp.abspath(osp.join(__file__, osp.pardir, osp.pardir, \"demo\"))\nEDG_FP = osp.join(DATA_DIR, \"karate.edg\")\nSETTINGS = [\n    (\"SparseOTF\", pecanpy.SparseOTF),\n    (\"DenseOTF\", pecanpy.DenseOTF),\n    (\"PreComp\", pecanpy.PreComp),\n    (\"PreCompFirstOrder\", pecanpy.PreCompFirstOrder),\n    (\"FirstOrderUnweighted\", pecanpy.FirstOrderUnweighted),\n]\n\n\nclass TestPecanPy(unittest.TestCase):\n    @classmethod\n    def setUpClass(self):\n        g = graph.DenseGraph()\n        g.read_edg(EDG_FP, weighted=False, directed=False)\n        self.mat = g.data\n        self.ids = g.nodes\n\n    @parameterized.expand(SETTINGS)\n    def test_from_mat(self, name, mode):\n        with self.subTest(name):\n            g = mode.from_mat(self.mat, self.ids, p=1, q=1)\n            g.embed()\n\n    @parameterized.expand(SETTINGS)\n    def test_from_edg(self, name, mode):\n        with self.subTest(name):\n            g = mode(p=1, q=1)\n            g.read_edg(EDG_FP, weighted=False, directed=False)\n            g.embed()\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "test/test_walk.py",
    "content": "import unittest\n\nimport numpy as np\nfrom numba import set_num_threads\nfrom parameterized import parameterized\nfrom pecanpy import pecanpy\n\nset_num_threads(1)\n\nMAT = np.array(\n    [\n        [0, 1, 0, 0, 0],\n        [1, 0, 1, 0, 0],\n        [0, 1, 0, 1, 1],\n        [0, 0, 1, 0, 1],\n        [0, 0, 1, 1, 0],\n    ],\n)\nIDS = [\"a\", \"b\", \"c\", \"d\", \"e\"]\n\nWALKS = {\n    \"FirstOrderUnweighted\": [\n        [\"c\", \"b\", \"c\", \"d\"],\n        [\"d\", \"c\", \"d\", \"e\"],\n        [\"e\", \"d\", \"c\", \"b\"],\n        [\"e\", \"d\", \"c\", \"b\"],\n        [\"b\", \"a\", \"b\", \"a\"],\n        [\"b\", \"a\", \"b\", \"c\"],\n        [\"c\", \"e\", \"d\", \"e\"],\n        [\"d\", \"c\", \"b\", \"c\"],\n        [\"a\", \"b\", \"c\", \"d\"],\n        [\"a\", \"b\", \"c\", \"b\"],\n    ],\n    \"PreCompFirstOrder\": [\n        [\"c\", \"d\", \"e\", \"d\"],\n        [\"d\", \"c\", \"d\", \"e\"],\n        [\"e\", \"d\", \"c\", \"e\"],\n        [\"e\", \"d\", \"e\", \"c\"],\n        [\"b\", \"c\", \"e\", \"c\"],\n        [\"b\", \"c\", \"d\", \"c\"],\n        [\"c\", \"d\", \"e\", \"d\"],\n        [\"d\", \"c\", \"e\", \"d\"],\n        [\"a\", \"b\", \"a\", \"b\"],\n        [\"a\", \"b\", \"c\", \"e\"],\n    ],\n    \"PreComp\": [\n        [\"c\", \"d\", \"e\", \"d\"],\n        [\"d\", \"c\", \"d\", \"e\"],\n        [\"e\", \"d\", \"c\", \"e\"],\n        [\"e\", \"d\", \"e\", \"c\"],\n        [\"b\", \"c\", \"e\", \"c\"],\n        [\"b\", \"c\", \"d\", \"c\"],\n        [\"c\", \"d\", \"e\", \"d\"],\n        [\"d\", \"c\", \"e\", \"d\"],\n        [\"a\", \"b\", \"a\", \"b\"],\n        [\"a\", \"b\", \"c\", \"e\"],\n    ],\n    \"SparseOTF\": [\n        [\"c\", \"d\", \"e\", \"d\"],\n        [\"d\", \"e\", \"c\", \"d\"],\n        [\"e\", \"c\", \"e\", \"d\"],\n        [\"e\", \"c\", \"e\", \"d\"],\n        [\"b\", \"c\", \"e\", \"c\"],\n        [\"b\", \"a\", \"b\", \"c\"],\n        [\"c\", \"e\", \"d\", \"e\"],\n        [\"d\", \"e\", 
\"c\", \"e\"],\n        [\"a\", \"b\", \"c\", \"b\"],\n        [\"a\", \"b\", \"c\", \"d\"],\n    ],\n    \"DenseOTF\": [\n        [\"c\", \"d\", \"e\", \"d\"],\n        [\"d\", \"e\", \"c\", \"d\"],\n        [\"e\", \"c\", \"e\", \"d\"],\n        [\"e\", \"c\", \"e\", \"d\"],\n        [\"b\", \"c\", \"e\", \"c\"],\n        [\"b\", \"a\", \"b\", \"c\"],\n        [\"c\", \"e\", \"d\", \"e\"],\n        [\"d\", \"e\", \"c\", \"e\"],\n        [\"a\", \"b\", \"c\", \"b\"],\n        [\"a\", \"b\", \"c\", \"d\"],\n    ],\n}\n\n\nclass TestWalk(unittest.TestCase):\n    @parameterized.expand(\n        [\n            (\"FirstOrderUnweighted\", pecanpy.FirstOrderUnweighted),\n            (\"PreCompFirstOrder\", pecanpy.PreComp),\n            (\"PreComp\", pecanpy.PreComp),\n            (\"SparseOTF\", pecanpy.SparseOTF),\n            (\"DenseOTF\", pecanpy.DenseOTF),\n        ],\n    )\n    def test_first_order_unweighted(self, name, mode):\n        graph = mode.from_mat(MAT, IDS, p=1, q=1, random_state=0)\n        walks = graph.simulate_walks(2, 3)\n        self.assertEqual(walks, WALKS[name])\n        print(walks)\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  },
  {
    "path": "tox.ini",
    "content": "[tox]\nminversion = 3.8.0\nenvlist =\n    python3.8\n    python3.9\n    python3.10\n    python3.11\n    flake8\n    mypy\nisolated_build = true\n\n[gh-actions]\npython =\n    3.8: python3.8, flake8\n    3.9: python3.9\n    3.10: python3.10\n    3.11: python3.11\n\n[testenv]\nsetenv =\n    PYTHONPATH = {toxinidir}\ndeps =\n    -r{toxinidir}/requirements.txt\n    .[dev]\ncommands =\n    pytest --basetemp={envtmpdir} test/\n\n[testenv:mypy]\nskip_install = true\ndeps =\n    mypy\n    numpy\ncommands = mypy src/pecanpy\n\n[testenv:flake8]\nskip_install = true\ndeps =\n    flake8\n    # flake8-bandit\n    flake8-builtins\n    flake8-bugbear\n    flake8-colors\n    flake8-commas\n    flake8-comprehensions\n    flake8-docstrings\n    flake8-import-order\n    flake8-use-fstring\n    pep8-naming\n    pydocstyle\ncommands =\n    flake8 src/pecanpy/\ndescription = Run the flake8 tool with several plugins (bandit, docstrings, import order, pep8 naming).\n\n[flake8]\nmax-line-length = 88\nextend-ignore =\n    A005\n    E203\n    # current limitation of nptyping https://github.com/ramonhagenaars/nptyping/issues/63\n    F722\n    # init param docstring in class docstring\n    D107\nexclude =\n    .tox,\n    .git,\n    __pycache__,\n    build,\n    dist,\n    *.pyc,\n    *.egg-info,\n    .cache,\n    .eggs\nimport-order-style = pycharm\napplication-import-names =\n    pybel\n    bel_resources\n    tests\n"
  }
]