Repository: zyxue/ncbitax2lin
Branch: master
Commit: 3f97a126721d
Files: 26
Total size: 31.4 KB

Directory structure:
gitextract_ve7hzziz/

├── .github/
│   └── workflows/
│       └── python-package.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE.txt
├── Makefile
├── README.md
├── mypy.ini
├── ncbitax2lin/
│   ├── __init__.py
│   ├── data_io.py
│   ├── fmt.py
│   ├── lineage.py
│   ├── ncbitax2lin.py
│   ├── struct.py
│   └── utils.py
├── pylintrc
├── pyproject.toml
├── tests/
│   ├── __init__.py
│   ├── test___init__.py
│   ├── test_data/
│   │   ├── names.head_20.dmp
│   │   └── nodes.head_20.dmp
│   ├── test_data_io.py
│   ├── test_fmt.py
│   ├── test_lineage.py
│   ├── test_ncbitax2lin.py
│   └── test_utils.py
└── tox.ini

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/python-package.yml
================================================
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python package

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-22.04
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]

    # Check out the code, install the project with poetry, then lint and test.
    steps:
    # v2 of checkout/setup-python are outdated major versions; use current ones.
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip poetry==1.8.5
        poetry install
    - name: Lint
      run: |
        make lint
    - name: Test
      run: |
        make test


================================================
FILE: .gitignore
================================================
.coverage
.vscode/
.mypy_cache/
.python-version
__pycache__/
dist
htmlcov/
ncbitax2lin.egg-info/
build


================================================
FILE: CHANGELOG.md
================================================
## Change Log

### v3.0.0 (2025/09/23)

- Fixed https://github.com/zyxue/ncbitax2lin/issues/31
- Upgraded dependencies; Python 3.9 through 3.13 are supported instead.

### v2.3.0 (2022/03/20)

- Supports Python-3.9

### v2.2.0 (2022/03/20)

- Fixed bug related to sharing global variables among multiple processes. (#14, #15)

### v2.0.2 (2020/05/02)

- Made pylint and mypy pass.

### v2.0.1 (2020/05/02)

- Adopted [poetry](https://python-poetry.org/) for package management.
- Modernized the code (Python-3.7, typing, and some tests).

### v1.1 (2017/03/17)

- Remove hosting converted lineages.csv.gz from the repo.
- Converted lineages will be versioned and hosted elsewhere.

### v1.0 (2016/04/24)

- Organized the code into a release.


================================================
FILE: LICENSE.txt
================================================
MIT License

Copyright (c) 2017 zyxue

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: Makefile
================================================
SRC_DIR=ncbitax2lin
TESTS_DIR=tests

# https://www.gnu.org/software/make/manual/html_node/Force-Targets.html
FORCE:

format: FORCE
	poetry run autoflake --recursive --in-place --remove-all-unused-imports $(SRC_DIR) $(TESTS_DIR) \
	&& poetry run black $(SRC_DIR) $(TESTS_DIR) \
	&& poetry run isort $(SRC_DIR) $(TESTS_DIR) \

black: FORCE
	poetry run black --check $(SRC_DIR) $(TESTS_DIR)

isort: FORCE
	poetry run isort --check $(SRC_DIR) $(TESTS_DIR)

mypy: FORCE
	poetry run mypy $(SRC_DIR) $(TESTS_DIR)

pylint: FORCE
	poetry run pylint $(SRC_DIR) $(TESTS_DIR)

# Run the test suite under coverage, then print and render the coverage report.
# PYTHONHASHSEED is given as a per-command environment assignment so that it
# reaches the Python process; `VAR=1 && cmd` only assigns a shell variable
# without exporting it to the command that follows.
test: FORCE
	PYTHONHASHSEED=1 poetry run coverage run --source=$(SRC_DIR) --module pytest --durations=10 --failed-first $(1) \
	&& poetry run coverage report --show-missing \
	&& poetry run coverage html

lint: black isort mypy pylint

all: lint test


================================================
FILE: README.md
================================================
# NCBItax2lin

[![Downloads](https://pepy.tech/badge/ncbitax2lin/week)](https://pepy.tech/project/ncbitax2lin)

Convert NCBI taxonomy dump into lineages. For example, the lineage for [human
(tax_id=9606)](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606)
looks like this:

| tax_id | superkingdom | phylum   | class    | order    | family    | genus | species      | family1 | forma | genus1 | infraclass | infraorder  | kingdom | no rank            | no rank1     | no rank10            | no rank11 | no rank12 | no rank13 | no rank14 | no rank15     | no rank16 | no rank17 | no rank18 | no rank19 | no rank2  | no rank20 | no rank21 | no rank22 | no rank3  | no rank4      | no rank5   | no rank6      | no rank7   | no rank8     | no rank9      | parvorder  | species group | species subgroup | species1 | subclass | subfamily | subgenus | subkingdom | suborder    | subphylum | subspecies | subtribe | superclass | superfamily | superorder       | superorder1 | superphylum | tribe | varietas |
|--------|--------------|----------|----------|----------|-----------|-------|--------------|---------|-------|--------|------------|-------------|---------|--------------------|--------------|----------------------|-----------|-----------|-----------|-----------|---------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|---------------|------------|---------------|------------|--------------|---------------|------------|---------------|------------------|----------|----------|-----------|----------|------------|-------------|-----------|------------|----------|------------|-------------|------------------|-------------|-------------|-------|----------|
| 9606   | Eukaryota    | Chordata | Mammalia | Primates | Hominidae | Homo  | Homo sapiens |         |       |        |            | Simiiformes | Metazoa | cellular organisms | Opisthokonta | Dipnotetrapodomorpha | Tetrapoda | Amniota   | Theria    | Eutheria  | Boreoeutheria |           |           |           |           | Eumetazoa |           |           |           | Bilateria | Deuterostomia | Vertebrata | Gnathostomata | Teleostomi | Euteleostomi | Sarcopterygii | Catarrhini |               |                  |          |          | Homininae |          |            | Haplorrhini | Craniata  |            |          |            | Hominoidea  | Euarchontoglires |             |             |       |          |

### Install

ncbitax2lin supports python-3.9 to python-3.13.

```
pip install -U ncbitax2lin
```

It is also available in Conda on the Bioconda channel:

```
conda install bioconda::ncbitax2lin
```

### Generate lineages

First download taxonomy dump from NCBI:

```bash
wget -N ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
mkdir -p taxdump && tar zxf taxdump.tar.gz -C ./taxdump
```

Then, run ncbitax2lin

```bash
ncbitax2lin --nodes-file taxdump/nodes.dmp --names-file taxdump/names.dmp
```

By default, the generated lineages will be saved to
`ncbi_lineages_[date_of_utcnow].csv.gz`. The output path can be overridden with
the `--output` option.


## FAQ

**Q**: I have a large number of sequences with their corresponding accession
numbers from NCBI. How do I get their lineages?

**A**: First, you need to map accession numbers (GI is deprecated) to tax IDs
based on `nucl_*accession2taxid.gz` files from
ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/. Secondly, you can trace a
sequence's whole lineage based on its tax ID. The tax-id-to-lineage mapping is
what NCBItax2lin can generate for you.

If you have any questions about this project, please feel free to create a new
[issue](https://github.com/zyxue/ncbitax2lin/issues/new).

## Note on `taxdump.tar.gz.md5`

It appears that NCBI periodically regenerates `taxdump.tar.gz` and
`taxdump.tar.gz.md5` even when its content is still the same. I am not sure how
their regeneration works, but `taxdump.tar.gz.md5` will differ simply because
of a different timestamp.

## Used in

* Mahmoudabadi, G., & Phillips, R. (2018). A comprehensive and quantitative exploration of thousands of viral genomes. ELife, 7. https://doi.org/10.7554/eLife.31955
* Dombrowski, N. et al. (2020) Undinarchaeota illuminate DPANN phylogeny and the impact of gene transfer on archaeal evolution, Nature Communications. Springer US, 11(1). doi: 10.1038/s41467-020-17408-w. https://www.nature.com/articles/s41467-020-17408-w
* Schenberger Santos, A. R. et al. (2020) NAD+ biosynthesis in bacteria is controlled by global carbon/ nitrogen levels via PII signaling, Journal of Biological Chemistry, 295(18), pp. 6165–6176. doi: 10.1074/jbc.RA120.012793. https://www.sciencedirect.com/science/article/pii/S0021925817482433
* Villada, J. C., Duran, M. F. and Lee, P. K. H. (2020) Interplay between Position-Dependent Codon Usage Bias and Hydrogen Bonding at the 5' End of ORFeomes, mSystems, 5(4), pp. 1–18. doi: 10.1128/msystems.00613-20. https://msystems.asm.org/content/5/4/e00613-20
* Byadgi, O. et al. (2020) Transcriptome analysis of amyloodinium ocellatum tomonts revealed basic information on the major potential virulence factors, Genes, 11(11), pp. 1–12. doi: 10.3390/genes11111252. https://www.mdpi.com/2073-4425/11/11/1252
* Cumbo, F., & Blankenberg, D. (2025). Characterization of microbial dark matter at scale with MetaSBT and taxonomy-aware Sequence Bloom Trees. bioRxiv. https://doi.org/10.1101/2025.08.25.672238

## Development

### Install dependencies

```
poetry install --sync
```

### Testing

```
make format
make all
```

### Publish (only for administrator)

```
poetry version [minor/major etc.]
git tag vx.y.z
git push origin vx.y.z
poetry publish --build -u __token__ --password pypi-<token-from-pypi>
```
Update [CHANGELOG.md](/CHANGELOG.md).


================================================
FILE: mypy.ini
================================================
[mypy]
python_version = 3.9
disallow_untyped_defs = True
ignore_missing_imports = True
show_column_numbers = True

================================================
FILE: ncbitax2lin/__init__.py
================================================
"""__init__.py for this project"""

__version__ = "2.4.1"


================================================
FILE: ncbitax2lin/data_io.py
================================================
"""utility functions related to IO"""

import pandas as pd

from ncbitax2lin import utils


def strip(str_: str) -> str:
    """
    Return ``str_`` without its leading and trailing whitespace.

    :param str_: the string to trim
    :return: the trimmed string
    """
    trimmed = str_.strip()
    return trimmed


@utils.timeit
def load_nodes(nodes_file: str) -> pd.DataFrame:
    """
    Read nodes.dmp into a pandas.DataFrame.

    The file is parsed as "|"-separated text without a header row; the
    columns are named explicitly below. The text columns ``rank``,
    ``embl_code`` and ``comments`` have surrounding whitespace removed.

    :param nodes_file: path to nodes.dmp
    :return: one row per line of the file
    """
    column_names = [
        "tax_id",
        "parent_tax_id",
        "rank",
        "embl_code",
        "division_id",
        "inherited_div_flag",
        "genetic_code_id",
        "inherited_GC__flag",
        "mitochondrial_genetic_code_id",
        "inherited_MGC_flag",
        "GenBank_hidden_flag",
        "hidden_subtree_root_flag",
        "comments",
    ]
    text_columns = ("rank", "embl_code", "comments")

    raw = pd.read_csv(
        nodes_file,
        sep="|",
        header=None,
        index_col=False,
        names=column_names,
    )

    # Values keep the whitespace that surrounded the "|" separators, so trim
    # the text columns.
    trimmed = {name: raw[name].apply(strip) for name in text_columns}
    return raw.assign(**trimmed)


@utils.timeit
def load_names(names_file: str) -> pd.DataFrame:
    """
    Read names.dmp into a pandas.DataFrame of scientific names.

    The file is parsed as "|"-separated text without a header row. The text
    columns are trimmed of surrounding whitespace, and only rows whose
    ``name_class`` equals "scientific name" are kept.

    :param names_file: path to names.dmp
    :return: the scientific-name rows with a fresh 0-based index
    """
    text_columns = ("name_txt", "unique_name", "name_class")

    raw = pd.read_csv(
        names_file,
        sep="|",
        header=None,
        index_col=False,
        names=["tax_id", "name_txt", "unique_name", "name_class"],
    )

    cleaned = raw.assign(**{name: raw[name].apply(strip) for name in text_columns})

    # Drop synonyms, authorities, common names, etc.
    is_scientific = cleaned["name_class"] == "scientific name"
    return cleaned.loc[is_scientific].reset_index(drop=True)


def read_names_and_nodes(names_file: str, nodes_file: str) -> pd.DataFrame:
    """Loads nodes and names and joins them on tax_id.

    Args:
        names_file: path to names.dmp
        nodes_file: path to nodes.dmp

    Returns:
        A dataframe with the columns tax_id, parent_tax_id, rank and
        rank_name (the latter renamed from name_txt).
    """
    nodes = load_nodes(nodes_file)
    names = load_names(names_file)

    # The default merge keeps only tax ids present in both tables.
    merged = nodes.merge(names, on="tax_id")
    selected = merged[["tax_id", "parent_tax_id", "rank", "name_txt"]]
    renamed = selected.rename(columns={"name_txt": "rank_name"})
    return renamed.reset_index(drop=True)


def write_lineages_to_disk(df_lineages: pd.DataFrame, output_path: str) -> None:
    """Writes the lineages as a gzip-compressed CSV file.

    The leading columns are tax_id followed by the main ranks; all remaining
    columns come after them in sorted order.

    Args:
        df_lineages: the lineages, one row per tax id
        output_path: where to write the gzipped CSV
    """
    # superkingdom has been renamed to domain in
    # https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/
    # Older taxdumps still carry "superkingdom", so prefer it when present.
    top_rank = "superkingdom" if "superkingdom" in df_lineages else "domain"

    leading = [
        "tax_id",
        top_rank,
        "phylum",
        "class",
        "order",
        "family",
        "genus",
        "species",
    ]
    trailing = [name for name in df_lineages.columns if name not in leading]
    trailing.sort()

    df_lineages.to_csv(
        output_path,
        columns=leading + trailing,
        compression="gzip",
        index=False,
    )


================================================
FILE: ncbitax2lin/fmt.py
================================================
"""Utilities for preparing the lineages for output."""

import concurrent.futures
from typing import Container, Dict, List, Union

import pandas as pd

from ncbitax2lin.struct import Lineage


def _calc_rank_key(rank: str, existing_ranks: Container[str]) -> str:
    """Calcluates a key for the lineage representation in a dictionary.

    Defaults to the rank itself, e.g. no rank, superkingdom, phylum, etc. but
    when a rank appears multiple times (common for "no rank" rank) in a single
    linearge it will be numbered, e.g. no rank1, no rank2, and so on.

    Args:
        rank: e.g. no rank, superkingdom, phylum, etc.
        existing_ranks: rank keys already existing
    """
    # e.g. there could be multiple 'no rank'
    if rank not in existing_ranks:
        return rank

    count = 1
    numbered_rank = f"{rank}{count}"
    while numbered_rank in existing_ranks:
        count += 1
        numbered_rank = f"{rank}{count}"
    return numbered_rank


def _convert_lineage_to_dict(lineage: Lineage) -> Dict[str, Union[int, str]]:
    """Converts a lineage from its list-of-tuples form into a dictionary.

    Every (tax_id, rank, rank_name) entry contributes one rank => rank_name
    item; ranks that repeat are numbered by _calc_rank_key. The tax_id of the
    final entry is stored under the "tax_id" key, i.e.

    [
        ("tax_id1", "rank1", "name_txt1"),
        ("tax_id2", "rank2", "name_txt2"),
        ...
    ]

    becomes

    {
        "rank1": "name_txt1",
        "rank2": "name_txt2",
        "tax_id": "tax_id2",   # the last entry identifies the lineage
    }

    For instance

        [
            (131567, 'no rank', 'cellular organisms'),
            (2, 'superkingdom', 'Bacteria')
        ]

    becomes

        {
            'no rank': 'cellular organisms',
            'superkingdom': 'Bacteria',
            'tax_id': 2,
        }

    """
    result: Dict[str, Union[int, str]] = {}
    last_position = len(lineage) - 1
    for position, entry in enumerate(lineage):
        node_id, rank, name = entry
        if position == last_position:
            # the final entry of the lineage provides its tax_id
            result["tax_id"] = node_id

        result[_calc_rank_key(rank, result.keys())] = name
    return result


def prepare_lineages_for_output(lineages: List[Lineage]) -> pd.DataFrame:
    """Builds a dataframe, sorted by tax_id, from a list of lineages.

    Each lineage is converted to a dictionary by _convert_lineage_to_dict in
    a process pool, and the dictionaries become the rows of the dataframe.

    Args:
        lineages: the lineages to convert

    Returns:
        A dataframe with one row per lineage, sorted by the tax_id column.
    """
    # NOTE(review): the mapped results are consumed by pd.DataFrame only after
    # the pool's context has exited — confirm this ordering is intended.
    with concurrent.futures.ProcessPoolExecutor() as pool:
        records = pool.map(_convert_lineage_to_dict, lineages, chunksize=5000)

    frame = pd.DataFrame(records)
    return frame.sort_values("tax_id")


================================================
FILE: ncbitax2lin/lineage.py
================================================
"""Utilities for finding lineages."""

import logging
import math
import multiprocessing
import os
import pickle
import tempfile
from typing import Dict, List

from ncbitax2lin import utils
from ncbitax2lin.struct import Lineage, TaxUnit

_LOGGER = logging.getLogger(__name__)

# tax_id of first line in names.dmp: no rank
ROOT_TAX_ID = 1


def _find_one_lineage(tax_id: int, tax_dict: Dict[int, TaxUnit]) -> Lineage:
    """Finds the lineage of a single tax id.

    Starting at tax_id, parents are followed through tax_dict until a parent
    equal to ROOT_TAX_ID is reached. The entries are returned ordered from
    the highest-level ancestor down to tax_id itself.

    Args:
        tax_id: the tax id to find the lineage for
        tax_dict: a dictionary of tax_id => tax_unit
    """
    if tax_id % 50000 == 0:
        # Progress is printed rather than logged: this runs in a subprocess
        # and _LOGGER.info output did not show up from here (reason unclear).
        print(f"working on tax_id: {tax_id}")

    ancestors = []
    current_id = tax_id
    at_root = False
    while not at_root:
        unit = tax_dict[current_id]
        ancestors.append((unit["tax_id"], unit["rank"], unit["rank_name"]))
        current_id = unit["parent_tax_id"]
        # every tax can be traced back to tax_id == 1, the root
        at_root = current_id == ROOT_TAX_ID

    # Entries were collected child-first; flip them so that numbering of
    # repeated ranks (e.g. multiple "no rank"s) runs from the top down.
    return Lineage(ancestors[::-1])


def _find_lineages(
    tax_ids: List[int], tax_dict: Dict[int, TaxUnit], output: str
) -> None:
    """Finds the lineages for a list of tax ids and pickles them to a file.

    Args:
        tax_ids: the tax ids to find lineages for
        tax_dict: a dictionary of tax_id => tax_unit
        output: path of the file the list of lineages is pickled into
    """
    collected = [_find_one_lineage(one_id, tax_dict) for one_id in tax_ids]

    with open(output, "wb") as handle:
        pickle.dump(collected, handle)


def _calc_num_procs(max_num: int = 6) -> int:
    """Calculates number of the processes to use."""
    return min(multiprocessing.cpu_count(), max_num)


def _calc_chunk_size(num_vals: int, num_chunks: int) -> int:
    """Calculates the chunk size."""
    return math.ceil(num_vals / num_chunks)


def find_all_lineages(
    tax_ids: List[int], tax_dict: Dict[int, TaxUnit]
) -> List[Lineage]:
    """Finds the lineages for all tax ids.

    The tax ids are split into consecutive chunks and every chunk is handled
    by its own worker process. Each worker pickles its lineages into a file
    inside a temporary directory; once all workers have been joined, the
    files are read back in chunk order.

    Args:
        tax_ids: all tax ids to find lineages for.
        tax_dict: a dictionary of tax_id => tax_unit.

    Returns:
        One lineage per tax id, in the same order as tax_ids.

    Raises:
        RuntimeError: if any worker process exits with a non-zero exit code.
    """
    if len(tax_ids) == 0:
        # Without this guard the chunk size would be 0 and partitioning would
        # fail, because range() does not accept a step of zero.
        return []

    nprocs = _calc_num_procs()
    _LOGGER.info(
        "will use %d processes to find lineages for all %s tax ids",
        nprocs,
        f"{len(tax_ids):,d}",
    )

    chunk_size = _calc_chunk_size(len(tax_ids), num_chunks=nprocs)
    _LOGGER.info("chunk_size = %d", chunk_size)

    tax_id_chunks = utils.partition(tax_ids, size=chunk_size)
    _LOGGER.info("chunked sizes: %s", [len(_) for _ in tax_id_chunks])

    procs: List[multiprocessing.Process] = []
    tmp_outputs: List[str] = []
    all_lineages: List[Lineage] = []

    with tempfile.TemporaryDirectory(suffix="_ncbitax2lin") as tmpdir:
        for index, chunk in enumerate(tax_id_chunks):
            tmp_output = os.path.join(tmpdir, f"_lineages_{index}.pkl")

            tmp_outputs.append(tmp_output)
            proc = multiprocessing.Process(
                target=_find_lineages, args=(chunk, tax_dict, tmp_output)
            )
            procs.append(proc)

        _LOGGER.info("Starting %d processes ...", len(procs))
        for proc in procs:
            proc.start()

        _LOGGER.info("Joining %d processes ...", len(procs))
        for proc in procs:
            proc.join()

        # A worker that died (e.g. on a tax id missing from tax_dict) leaves
        # no usable output file; report that directly instead of failing
        # later while trying to open the file.
        failed = [proc for proc in procs if proc.exitcode != 0]
        if failed:
            details = ", ".join(
                f"{proc.name} (exit code {proc.exitcode})" for proc in failed
            )
            raise RuntimeError(
                f"{len(failed)} worker process(es) failed while finding lineages: "
                f"{details}"
            )

        for tmp_output in tmp_outputs:
            _LOGGER.info("adding lineages from %s ...", tmp_output)
            # The pickle files were written by this program's own workers
            # into a private temporary directory, not by an external source.
            with open(tmp_output, "rb") as opened:
                all_lineages.extend(pickle.load(opened))

    assert len(all_lineages) == len(tax_ids), (
        f"There are {len(tax_ids)} tax_ids, but {len(all_lineages)} lineages are generated, "
        "the two numbers should've been the same"
    )
    return all_lineages


================================================
FILE: ncbitax2lin/ncbitax2lin.py
================================================
"""Converts NCBI taxonomy dump into lineages"""

import logging
import sys
from typing import Dict, Optional

import fire
import pandas as pd

from ncbitax2lin import data_io, fmt, lineage, utils
from ncbitax2lin.struct import TaxUnit

logging.basicConfig(level=logging.DEBUG, format="%(asctime)s|%(levelname)s|%(message)s")


_LOGGER = logging.getLogger(__name__)


def _calc_taxonomy_dict(df_tax: pd.DataFrame) -> Dict[int, TaxUnit]:
    """Builds a tax_id => row dictionary from the df_tax dataframe.

    Every row of df_tax is turned into a dictionary of column => value and
    stored under the row's tax_id.
    """
    tax_id_keys = df_tax.tax_id.values
    tax_units = df_tax.to_dict("records")
    return dict(zip(tax_id_keys, tax_units))


def taxonomy_to_lineages(
    nodes_file: str, names_file: str, output: Optional[str] = None
) -> None:
    """Converts NCBI taxonomy dump into lineages.

    NCBI taxonomy dump can be downloaded from
    ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz

    Args:
        nodes_file: path/to/taxdump/nodes.dmp from NCBI taxonomy
        names_file: path/to/taxdump/names.dmp from NCBI taxonomy
        output: path of the gzipped CSV file the lineages are written to.
            Defaults to ncbi_lineages_[current UTC date].csv.gz. A file that
            already exists at this path is renamed to a backup first.
    """
    df_data = data_io.read_names_and_nodes(names_file, nodes_file)
    _LOGGER.info("# of tax ids: %s", f"{df_data.shape[0]:,d}")
    _LOGGER.info("df.info:\n%s", f"{utils.collect_df_info(df_data)}")

    _LOGGER.info("Generating a dictionary of taxonomy: tax_id => tax_unit ...")
    tax_dict = _calc_taxonomy_dict(df_data)

    # sys.getsizeof only accounts for the dict object itself, not for the
    # objects it refers to, so this is a rough lower bound.
    tax_dict_size_mb = sys.getsizeof(tax_dict) / 2**20
    _LOGGER.info("size of taxonomy_dict: ~%s MB", f"{tax_dict_size_mb:.0f}")

    tax_ids = df_data.tax_id.to_numpy().tolist()

    _LOGGER.info("Finding all lineages ...")
    all_lineages = lineage.find_all_lineages(tax_ids, tax_dict)

    _LOGGER.info("Preparings all lineages into a dataframe to be written to disk ...")
    df_lineages = fmt.prepare_lineages_for_output(all_lineages)

    if output is None:
        output = f"ncbi_lineages_{pd.Timestamp.utcnow().date()}.csv.gz"

    # Keep any previous output instead of overwriting it.
    utils.maybe_backup_file(output)

    _LOGGER.info("Writing lineages to %s ...", output)
    data_io.write_lineages_to_disk(df_lineages, output)


def main() -> None:
    """Main function, entry point

    Hands taxonomy_to_lineages to fire.Fire, which exposes it as the
    command-line interface.
    """
    fire.Fire(taxonomy_to_lineages)


================================================
FILE: ncbitax2lin/struct.py
================================================
"""Data structures."""

from typing import List, NewType, Tuple

from typing_extensions import TypedDict


class TaxUnit(TypedDict):
    """
    Represents a basic unit in taxonomy e.g. (phylum, Proteobacteria), where
    phylum is the rank, and Proteobacteria is the rank name.

    This is a TypedDict, so instances are plain dictionaries with the keys
    declared below.
    """

    # identifier of this tax unit
    tax_id: int
    parent_tax_id: int  # tax_id of parent tax unit for this tax unit
    # rank of this tax unit, e.g. phylum
    rank: str
    # name of this tax unit at its rank, e.g. Proteobacteria
    rank_name: str


# A lineage is a list of (tax_id, rank, rank_name) tuples.
Lineage = NewType("Lineage", List[Tuple[int, str, str]])


================================================
FILE: ncbitax2lin/utils.py
================================================
"""Utility functions"""

import datetime
import functools
import io
import logging
import os
import time
from typing import Any, Callable, List, TypeVar

import pandas as pd

_LOGGER = logging.getLogger(__name__)


def timeit(func: Callable[..., Any]) -> Callable[..., Any]:
    """Wraps a function so that the wall-clock time of each call is logged.

    Usually used as a decorator. The wrapper passes all arguments through,
    returns the result of func unchanged, and logs the elapsed time at INFO
    level after func has returned.
    """

    @functools.wraps(func)
    def timed_func(*args: Any, **kwargs: Any) -> Any:
        """Calls func, logs how long the call took and returns its result"""
        began = time.time()
        outcome = func(*args, **kwargs)
        seconds_taken = time.time() - began
        _LOGGER.info(
            "time spent on %s: %s",
            func.__name__,
            datetime.timedelta(seconds=seconds_taken),
        )
        return outcome

    return timed_func


def maybe_backup_file(filepath: str) -> None:
    """
    Back up a file if it exists: old_file is renamed to #old_file.n# in the
    same directory, where n is the smallest positive number for which no such
    backup exists yet. Does nothing when filepath does not exist.

    :param filepath: path of the file to back up
    """
    if not os.path.exists(filepath):
        return

    dirname, basename = os.path.split(filepath)

    # Find the first backup name that is still free.
    count = 1
    backup = os.path.join(dirname, f"#{basename}.{count}#")
    while os.path.exists(backup):
        count += 1
        backup = os.path.join(dirname, f"#{basename}.{count}#")

    # Log through the module logger, like the rest of this module, rather
    # than through the root logger.
    _LOGGER.info("Backing up %s to %s", filepath, backup)
    os.rename(filepath, backup)


ElemType = TypeVar("ElemType")  # pylint: disable=invalid-name


def partition(vals: List[ElemType], size: int) -> List[List[ElemType]]:
    """Partitions a list into consecutive sublists of at most `size` elements.

    All sublists have `size` elements, except possibly the last one, which
    holds whatever remains.
    """
    chunks: List[List[ElemType]] = []
    for start in range(0, len(vals), size):
        stop = start + size
        chunks.append(vals[start:stop])
    return chunks


def collect_df_info(df_data: pd.DataFrame) -> str:
    """Returns the verbose `DataFrame.info()` report, with deep memory usage, as a string"""
    with io.StringIO() as sink:
        df_data.info(buf=sink, verbose=True, memory_usage="deep")
        report = sink.getvalue()
    return report


================================================
FILE: pylintrc
================================================
[MESSAGES CONTROL]
disable=fixme, duplicate-code


================================================
FILE: pyproject.toml
================================================
[tool.poetry]
name = "ncbitax2lin"
version = "3.0.0"
description = "A tool that converts NCBI taxonomy dump into lineages"
authors = ["Zhuyi Xue <zhuyi.xue@alum.utoronto.ca>"]
readme = "README.md"
homepage = "https://github.com/zyxue/ncbitax2lin"
license = "MIT"

[tool.poetry.dependencies]
fire = "^0.7.1"
pandas = "^2.3.2"
python = "^3.9,<3.14"
typing-extensions = "^4.15.0"

[tool.poetry.dev-dependencies]
autoflake = "^1.3.1"
black = "^22.1.0"
coverage = "^7.5.4"
isort = "^5.7.0"
mypy = "^1.18.2"
pylint = "^3.3.8"
pytest = "^8.4.2"
pytest-parallel = "^0.1.0"
tox = "^3.21.4"

[tool.poetry.scripts]
ncbitax2lin = "ncbitax2lin.ncbitax2lin:main"

[build-system]
# Build with poetry-core, the standalone PEP 517 backend, instead of pulling
# in the full poetry package through the legacy poetry.masonry.api entry point.
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

# https://pycqa.github.io/isort/docs/configuration/black_compatibility/
[tool.isort]
profile = "black"
multi_line_output = 3
known_first_party = ["ncbitax2lin"]

================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/test___init__.py
================================================
"""tests for __init__.py"""

# pylint: disable=protected-access, missing-function-docstring
from ncbitax2lin import __version__


def test_version() -> None:
    # Pins the package version string exposed by ncbitax2lin/__init__.py.
    # NOTE(review): pyproject.toml declares version 3.0.0 while this expects
    # 2.4.1 — confirm which one is intended and keep them in sync.
    assert __version__ == "2.4.1"


================================================
FILE: tests/test_data/names.head_20.dmp
================================================
1	|	all	|		|	synonym	|
1	|	root	|		|	scientific name	|
2	|	Bacteria	|	Bacteria <bacteria>	|	scientific name	|
2	|	Monera	|	Monera <bacteria>	|	in-part	|
2	|	Procaryotae	|	Procaryotae <bacteria>	|	in-part	|
2	|	Prokaryota	|	Prokaryota <bacteria>	|	in-part	|
2	|	Prokaryotae	|	Prokaryotae <bacteria>	|	in-part	|
2	|	bacteria	|		|	blast name	|
2	|	eubacteria	|		|	genbank common name	|
2	|	prokaryote	|	prokaryote <bacteria>	|	in-part	|
2	|	prokaryotes	|	prokaryotes <bacteria>	|	in-part	|
6	|	Azorhizobium	|		|	scientific name	|
6	|	Azorhizobium Dreyfus et al. 1988 emend. Lang et al. 2013	|		|	authority	|
7	|	ATCC 43989	|	ATCC 43989 <type strain>	|	type material	|
7	|	Azorhizobium caulinodans	|		|	scientific name	|
7	|	Azorhizobium caulinodans Dreyfus et al. 1988	|		|	authority	|
7	|	Azotirhizobium caulinodans	|		|	equivalent name	|
7	|	CCUG 26647	|	CCUG 26647 <type strain>	|	type material	|
7	|	DSM 5975	|	DSM 5975 <type strain>	|	type material	|
7	|	IFO 14845	|	IFO 14845 <type strain>	|	type material	|


================================================
FILE: tests/test_data/nodes.head_20.dmp
================================================
1	|	1	|	no rank	|		|	8	|	0	|	1	|	0	|	0	|	0	|	0	|	0	|		|
2	|	131567	|	superkingdom	|		|	0	|	0	|	11	|	0	|	0	|	0	|	0	|	0	|		|
6	|	335928	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
7	|	6	|	species	|	AC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
9	|	32199	|	species	|	BA	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
10	|	1706371	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
11	|	1707	|	species	|	CG	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
13	|	203488	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
14	|	13	|	species	|	DT	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
16	|	32011	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
17	|	16	|	species	|	MM	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
18	|	213421	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
19	|	18	|	species	|	PC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
20	|	76892	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
21	|	20	|	species	|	PI	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
22	|	267890	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
23	|	22	|	species	|	SC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
24	|	22	|	species	|	SP	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
25	|	22	|	species	|	SH	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
27	|	49928	|	species	|	HE	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|


================================================
FILE: tests/test_data_io.py
================================================
"""tests for data_io.py"""
# pylint: disable=protected-access, missing-function-docstring

from pathlib import Path

import pandas as pd

from ncbitax2lin import data_io


def test_load_nodes() -> None:
    # fixture: the first 20 lines of nodes.dmp from NCBI
    fixture = Path(__file__).parent / "./test_data/nodes.head_20.dmp"
    loaded = data_io.load_nodes(fixture.as_posix())
    assert isinstance(loaded, pd.DataFrame)


def test_load_names() -> None:
    # fixture: the first 20 lines of names.dmp from NCBI
    fixture = Path(__file__).parent / "./test_data/names.head_20.dmp"
    loaded = data_io.load_names(fixture.as_posix())
    assert isinstance(loaded, pd.DataFrame)


================================================
FILE: tests/test_fmt.py
================================================
"""tests for fmt.py"""
# pylint: disable=missing-function-docstring, protected-access
from typing import Container

import pytest

from ncbitax2lin import fmt


@pytest.mark.parametrize(
    "test_input_rank, test_input_existing_ranks, expected",
    [
        # rank not taken yet: used as is
        ("no rank", {}, "no rank"),
        ("no rank", {"some other rank"}, "no rank"),
        # rank already taken: first free numbered key
        ("no rank", {"no rank"}, "no rank1"),
        ("rankx", ["rankx"], "rankx1"),
        ("rankx", ["rankx", "rankx1"], "rankx2"),
    ],
)
def test__calc_rank_key(
    test_input_rank: str, test_input_existing_ranks: Container[str], expected: str
) -> None:
    assert fmt._calc_rank_key(test_input_rank, test_input_existing_ranks) == expected


================================================
FILE: tests/test_lineage.py
================================================
"""tests for lineage.py"""
# pylint: disable=missing-function-docstring, protected-access

from unittest.mock import MagicMock, patch

import pytest

from ncbitax2lin import lineage


@patch("multiprocessing.cpu_count", return_value=999, autospec=True)
def test__calc_num_procs(mock_cpu_count: MagicMock) -> None:
    # with far more CPUs than the default cap, the cap of 6 is returned
    assert lineage._calc_num_procs() == 6
    mock_cpu_count.assert_called_once_with()


@pytest.mark.parametrize(
    "num_vals, num_chunks, chunk_size",
    [
        (10, 3, 4),
        (11, 3, 4),
        (12, 3, 4),
        (13, 3, 5),
        (14, 3, 5),
        (15, 3, 5),
        (16, 3, 6),
    ],
)
def test__calc_chunk_size_procs(
    num_vals: int, num_chunks: int, chunk_size: int
) -> None:
    """Check _calc_chunk_size against a table of expected chunk sizes.

    Args:
        num_vals: number of values passed to _calc_chunk_size.
        num_chunks: number of chunks passed to _calc_chunk_size.
        chunk_size: the chunk size the call is expected to return.
    """
    actual = lineage._calc_chunk_size(num_vals, num_chunks)
    expected = chunk_size
    assert actual == expected
    # Check the type of the computed value rather than the parametrized
    # expectation: the latter is always an int literal, so asserting on it
    # could never fail (and e.g. 4.0 == 4 would slip through the check above).
    assert isinstance(actual, int)


================================================
FILE: tests/test_ncbitax2lin.py
================================================
"""tests for ncbitax2lin.py"""
# pylint: disable=protected-access, missing-function-docstring


import pandas as pd

from ncbitax2lin import ncbitax2lin


def test__calc_taxonomy_dict() -> None:
    """_calc_taxonomy_dict should map each tax_id to a dict of its row values."""
    taxonomy_frame = pd.DataFrame(
        {
            "tax_id": [1, 2, 6],
            "parent_tax_id": [1, 131567, 335928],
            "rank": ["no rank", "superkingdom", "genus"],
            "rank_name": ["root", "Bacteria", "Azorhizobium"],
        }
    )

    # one expected record per input row, keyed below by its tax_id
    root_record = {
        "tax_id": 1,
        "parent_tax_id": 1,
        "rank": "no rank",
        "rank_name": "root",
    }
    bacteria_record = {
        "tax_id": 2,
        "parent_tax_id": 131567,
        "rank": "superkingdom",
        "rank_name": "Bacteria",
    }
    azorhizobium_record = {
        "tax_id": 6,
        "parent_tax_id": 335928,
        "rank": "genus",
        "rank_name": "Azorhizobium",
    }

    assert ncbitax2lin._calc_taxonomy_dict(taxonomy_frame) == {
        1: root_record,
        2: bacteria_record,
        6: azorhizobium_record,
    }


================================================
FILE: tests/test_utils.py
================================================
"""tests for utils.py"""

# pylint: disable=protected-access, missing-function-docstring

import os
from typing import List
from unittest.mock import MagicMock, call, patch

import pytest

from ncbitax2lin import utils


def test_maybe_backup_file_when_file_path_does_not_exist() -> None:
    """A path reported as missing is probed exactly once by maybe_backup_file."""
    missing_path = "some_non_existing_file"
    with patch("os.path.exists", return_value=False) as exists_mock:
        utils.maybe_backup_file(missing_path)

    # the mock keeps its recorded calls after the patch context has exited
    exists_mock.assert_called_once_with(missing_path)


@patch("os.rename", spec=os.rename)
@patch("os.path.exists")
def test_maybe_backup_file_when_file_path_exists(
    mock_exists: MagicMock, mock_rename: MagicMock
) -> None:
    """An existing file is renamed to its first backup name when that is free."""
    source_path = "some_existing_file"
    backup_path = "#some_existing_file.1#"
    # first probe (the file itself) -> exists; second probe (backup) -> free
    mock_exists.side_effect = [True, False]

    utils.maybe_backup_file(source_path)

    mock_exists.assert_has_calls([call(source_path), call(backup_path)])
    mock_rename.assert_called_once_with(source_path, backup_path)


@patch("os.rename", spec=os.rename)
@patch("os.path.exists")
def test_maybe_backup_file_when_backfile_also_exists(
    mock_exists: MagicMock, mock_rename: MagicMock
) -> None:
    """If the first backup name is taken too, the second backup name is used."""
    source_path = "some_existing_file"
    taken_backup = "#some_existing_file.1#"
    free_backup = "#some_existing_file.2#"
    # probes in order: the file -> exists, backup .1 -> exists, backup .2 -> free
    mock_exists.side_effect = [True, True, False]

    utils.maybe_backup_file(source_path)

    expected_probes = [call(source_path), call(taken_backup), call(free_backup)]
    mock_exists.assert_has_calls(expected_probes)
    mock_rename.assert_called_once_with(source_path, free_backup)


@pytest.mark.parametrize(
    "test_input, size, expected",
    [
        # size equal to the input length: a single chunk
        ([1, 2, 3], 3, [[1, 2, 3]]),
        # input length not divisible by size: shorter trailing chunk
        ([1, 2, 3], 2, [[1, 2], [3]]),
        # input length divisible by size: equally sized chunks
        ([1, 2, 3, 4], 2, [[1, 2], [3, 4]]),
        ([1, 2, 3, 4, 5], 2, [[1, 2], [3, 4], [5]]),
        ([1, 2, 3, 4, 5], 3, [[1, 2, 3], [4, 5]]),
    ],
)
def test__partition(
    test_input: List[int],
    size: int,
    expected: List[List[int]],
) -> None:
    """utils.partition should split test_input into the expected chunks."""
    chunks = utils.partition(test_input, size)
    assert chunks == expected


================================================
FILE: tox.ini
================================================
[tox]
isolated_build = True
envlist = py39,py310,py311,py312,py313

[testenv]
allowlist_externals =
    poetry
    pytest
commands =
    poetry install --verbose