Repository: zyxue/ncbitax2lin Branch: master Commit: 3f97a126721d Files: 26 Total size: 31.4 KB Directory structure: gitextract_ve7hzziz/ ├── .github/ │ └── workflows/ │ └── python-package.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE.txt ├── Makefile ├── README.md ├── mypy.ini ├── ncbitax2lin/ │ ├── __init__.py │ ├── data_io.py │ ├── fmt.py │ ├── lineage.py │ ├── ncbitax2lin.py │ ├── struct.py │ └── utils.py ├── pylintrc ├── pyproject.toml ├── tests/ │ ├── __init__.py │ ├── test___init__.py │ ├── test_data/ │ │ ├── names.head_20.dmp │ │ └── nodes.head_20.dmp │ ├── test_data_io.py │ ├── test_fmt.py │ ├── test_lineage.py │ ├── test_ncbitax2lin.py │ └── test_utils.py └── tox.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/python-package.yml ================================================ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: Python package on: push: branches: [ master ] pull_request: branches: [ master ] jobs: build: runs-on: ubuntu-22.04 strategy: matrix: python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip poetry==1.8.5 poetry install - name: Lint run: | make lint - name: Test run: | make test ================================================ FILE: .gitignore ================================================ .coverage .vscode/ .mypy_cache/ .python-version __pycache__/ dist htmlcov/ ncbitax2lin.egg-info/ build ================================================ FILE: CHANGELOG.md 
================================================ ## Change Log ### v3.0.0 (2025/09/23) - Fixed https://github.com/zyxue/ncbitax2lin/issues/31 - Upgraded dependencies and support py39 to py313 instead. ### v2.3.0 (2022/03/20) - Supports Python-3.9 ### v2.2.0 (2022/03/20) - Fixed bug related to sharing global variables among multiple processes. (#14, #15) ### v2.0.2 (2020/05/02) - Made pylint and mypy pass. ### v2.0.1 (2020/05/02) - Adopted [poetry](https://python-poetry.org/) for package management. - Modernized the code (Python-3.7, typing, and some tests). ### v1.1 (2017/03/17) - Remove hosting converted lineages.csv.gz from the repo. - Converted lineages will be versioned and hosted elsewhere. ### v1.0 (2016/04/24) - Organized the code into a release. ================================================ FILE: LICENSE.txt ================================================ MIT License Copyright (c) 2017 zyxue Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================
FILE: Makefile
================================================
SRC_DIR=ncbitax2lin
TESTS_DIR=tests

# https://www.gnu.org/software/make/manual/html_node/Force-Targets.html
FORCE:

# Rewrites the sources in place: drops unused imports, then applies black and
# isort.
format: FORCE
	poetry run autoflake --recursive --in-place --remove-all-unused-imports $(SRC_DIR) $(TESTS_DIR) \
	&& poetry run black $(SRC_DIR) $(TESTS_DIR) \
	&& poetry run isort $(SRC_DIR) $(TESTS_DIR)

black: FORCE
	poetry run black --check $(SRC_DIR) $(TESTS_DIR)

isort: FORCE
	poetry run isort --check $(SRC_DIR) $(TESTS_DIR)

mypy: FORCE
	poetry run mypy $(SRC_DIR) $(TESTS_DIR)

pylint: FORCE
	poetry run pylint $(SRC_DIR) $(TESTS_DIR)

# PYTHONHASHSEED is given as a command prefix so that it is exported to the
# test process; a bare `PYTHONHASHSEED=1 && cmd` only sets an unexported shell
# variable that `cmd` never sees.
test: FORCE
	PYTHONHASHSEED=1 poetry run coverage run --source=$(SRC_DIR) --module pytest --durations=10 --failed-first $(1) \
	&& poetry run coverage report --show-missing \
	&& poetry run coverage html

lint: black isort mypy pylint

all: lint test


================================================
FILE: README.md
================================================
# NCBItax2lin

[![Downloads](https://pepy.tech/badge/ncbitax2lin/week)](https://pepy.tech/project/ncbitax2lin)

Convert NCBI taxonomy dump into lineages.
An example for [human (tax_id=9606)](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606) is like | tax_id | superkingdom | phylum | class | order | family | genus | species | family1 | forma | genus1 | infraclass | infraorder | kingdom | no rank | no rank1 | no rank10 | no rank11 | no rank12 | no rank13 | no rank14 | no rank15 | no rank16 | no rank17 | no rank18 | no rank19 | no rank2 | no rank20 | no rank21 | no rank22 | no rank3 | no rank4 | no rank5 | no rank6 | no rank7 | no rank8 | no rank9 | parvorder | species group | species subgroup | species1 | subclass | subfamily | subgenus | subkingdom | suborder | subphylum | subspecies | subtribe | superclass | superfamily | superorder | superorder1 | superphylum | tribe | varietas | |--------|--------------|----------|----------|----------|-----------|-------|--------------|---------|-------|--------|------------|-------------|---------|--------------------|--------------|----------------------|-----------|-----------|-----------|-----------|---------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|---------------|------------|---------------|------------|--------------|---------------|------------|---------------|------------------|----------|----------|-----------|----------|------------|-------------|-----------|------------|----------|------------|-------------|------------------|-------------|-------------|-------|----------| | 9606 | Eukaryota | Chordata | Mammalia | Primates | Hominidae | Homo | Homo sapiens | | | | | Simiiformes | Metazoa | cellular organisms | Opisthokonta | Dipnotetrapodomorpha | Tetrapoda | Amniota | Theria | Eutheria | Boreoeutheria | | | | | Eumetazoa | | | | Bilateria | Deuterostomia | Vertebrata | Gnathostomata | Teleostomi | Euteleostomi | Sarcopterygii | Catarrhini | | | | | Homininae | | | Haplorrhini | Craniata | | | | Hominoidea | Euarchontoglires | | | | | ### Install ncbitax2lin supports python-3.9 
to python-3.13. ``` pip install -U ncbitax2lin ``` It is also available in Conda on the Bioconda channel: ``` conda install bioconda::ncbitax2lin ``` ### Generate lineages First, download the taxonomy dump from NCBI: ```bash wget -N ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz mkdir -p taxdump && tar zxf taxdump.tar.gz -C ./taxdump ``` Then, run ncbitax2lin ```bash ncbitax2lin --nodes-file taxdump/nodes.dmp --names-file taxdump/names.dmp ``` By default, the generated lineages will be saved to `ncbi_lineages_[date_of_utcnow].csv.gz`. The output path can be overridden with the `--output` option. ## FAQ **Q**: I have a large number of sequences with their corresponding accession numbers from NCBI, how do I get their lineages? **A**: First, you need to map accession numbers (GI is deprecated) to tax IDs based on `nucl_*accession2taxid.gz` files from ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/. Secondly, you can trace a sequence's whole lineage based on its tax ID. The tax-id-to-lineage mapping is what NCBItax2lin can generate for you. If you have any questions about this project, please feel free to create a new [issue](https://github.com/zyxue/ncbitax2lin/issues/new). ## Note on `taxdump.tar.gz.md5` It appears that NCBI periodically regenerates `taxdump.tar.gz` and `taxdump.tar.gz.md5` even when their content is still the same. I am not sure how their regeneration works, but `taxdump.tar.gz.md5` will differ simply because of a different timestamp. ## Used in * Mahmoudabadi, G., & Phillips, R. (2018). A comprehensive and quantitative exploration of thousands of viral genomes. ELife, 7. https://doi.org/10.7554/eLife.31955 * Dombrowski, N. et al. (2020) Undinarchaeota illuminate DPANN phylogeny and the impact of gene transfer on archaeal evolution, Nature Communications. Springer US, 11(1). doi: 10.1038/s41467-020-17408-w. https://www.nature.com/articles/s41467-020-17408-w * Schenberger Santos, A. R. et al. 
(2020) NAD+ biosynthesis in bacteria is controlled by global carbon/ nitrogen levels via PII signaling, Journal of Biological Chemistry, 295(18), pp. 6165–6176. doi: 10.1074/jbc.RA120.012793. https://www.sciencedirect.com/science/article/pii/S0021925817482433 * Villada, J. C., Duran, M. F. and Lee, P. K. H. (2020) Interplay between Position-Dependent Codon Usage Bias and Hydrogen Bonding at the 5' End of ORFeomes, mSystems, 5(4), pp. 1–18. doi: 10.1128/msystems.00613-20. https://msystems.asm.org/content/5/4/e00613-20 * Byadgi, O. et al. (2020) Transcriptome analysis of amyloodinium ocellatum tomonts revealed basic information on the major potential virulence factors, Genes, 11(11), pp. 1–12. doi: 10.3390/genes11111252. https://www.mdpi.com/2073-4425/11/11/1252 * Cumbo, F., & Blankenberg, D. (2025). Characterization of microbial dark matter at scale with MetaSBT and taxonomy-aware Sequence Bloom Trees. bioRxiv. https://doi.org/10.1101/2025.08.25.672238 ## Development ### Install dependencies ``` poetry install --sync ``` ### Testing ``` make format make all ``` ### Publish (only for administrator) ``` poetry version [minor/major etc.] git tag vx.y.z git push origin vx.y.z poetry publish --build -u __token__ --password pypi- ``` Update [CHANGELOG.md](/CHANGELOG.md). 
================================================ FILE: mypy.ini ================================================ [mypy] python_version = 3.9 disallow_untyped_defs = True ignore_missing_imports = True show_column_numbers = True ================================================ FILE: ncbitax2lin/__init__.py ================================================ """__init__.py for this project""" __version__ = "2.4.1" ================================================ FILE: ncbitax2lin/data_io.py ================================================ """utility functions related to IO""" import pandas as pd from ncbitax2lin import utils def strip(str_: str) -> str: """ :param str_: a string """ return str_.strip() @utils.timeit def load_nodes(nodes_file: str) -> pd.DataFrame: """ load nodes.dmp and convert it into a pandas.DataFrame """ df_data = pd.read_csv( nodes_file, sep="|", header=None, index_col=False, names=[ "tax_id", "parent_tax_id", "rank", "embl_code", "division_id", "inherited_div_flag", "genetic_code_id", "inherited_GC__flag", "mitochondrial_genetic_code_id", "inherited_MGC_flag", "GenBank_hidden_flag", "hidden_subtree_root_flag", "comments", ], ) return df_data.assign( rank=lambda df: df["rank"].apply(strip), embl_code=lambda df: df["embl_code"].apply(strip), comments=lambda df: df["comments"].apply(strip), ) @utils.timeit def load_names(names_file: str) -> pd.DataFrame: """ load names.dmp and convert it into a pandas.DataFrame """ df_data = pd.read_csv( names_file, sep="|", header=None, index_col=False, names=["tax_id", "name_txt", "unique_name", "name_class"], ) return ( df_data.assign( name_txt=lambda df: df["name_txt"].apply(strip), unique_name=lambda df: df["unique_name"].apply(strip), name_class=lambda df: df["name_class"].apply(strip), ) .loc[lambda df: df["name_class"] == "scientific name"] .reset_index(drop=True) ) def read_names_and_nodes(names_file: str, nodes_file: str) -> pd.DataFrame: """Reads in data from names and nodes files""" # data downloaded from 
ftp://ftp.ncbi.nih.gov/pub/taxonomy/ # args = parse_args() nodes_df = load_nodes(nodes_file) names_df = load_names(names_file) return ( nodes_df.merge(names_df, on="tax_id")[ ["tax_id", "parent_tax_id", "rank", "name_txt"] ] .rename(columns={"name_txt": "rank_name"}) .reset_index(drop=True) ) def write_lineages_to_disk(df_lineages: pd.DataFrame, output_path: str) -> None: """Gzip lineages and write them to disk""" # superkingdom has been renamed to domain in # https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/ domain_col = "domain" # For backwards compatibility with older taxdumps. if "superkingdom" in df_lineages: domain_col = "superkingdom" cols = [ "tax_id", domain_col, "phylum", "class", "order", "family", "genus", "species", ] other_cols = sorted([col for col in df_lineages.columns if col not in cols]) output_cols = cols + other_cols df_lineages.to_csv( output_path, index=False, compression="gzip", columns=output_cols ) ================================================ FILE: ncbitax2lin/fmt.py ================================================ """Utilities for preparing the lineages for output.""" import concurrent.futures from typing import Container, Dict, List, Union import pandas as pd from ncbitax2lin.struct import Lineage def _calc_rank_key(rank: str, existing_ranks: Container[str]) -> str: """Calcluates a key for the lineage representation in a dictionary. Defaults to the rank itself, e.g. no rank, superkingdom, phylum, etc. but when a rank appears multiple times (common for "no rank" rank) in a single linearge it will be numbered, e.g. no rank1, no rank2, and so on. Args: rank: e.g. no rank, superkingdom, phylum, etc. existing_ranks: rank keys already existing """ # e.g. 
there could be multiple 'no rank' if rank not in existing_ranks: return rank count = 1 numbered_rank = f"{rank}{count}" while numbered_rank in existing_ranks: count += 1 numbered_rank = f"{rank}{count}" return numbered_rank def _convert_lineage_to_dict(lineage: Lineage) -> Dict[str, Union[int, str]]: """Converts the lineage in a list-of-tuples represetantion to a dictionary representation [ ("tax_id1", "rank1", "name_txt1"), ("tax_id2", "rank2", "name_txt2"), ... ] becomes { "rank1": "name_txt1", "rank2": "name_txt2", "tax_id": "tax_id2", # using the last rank as the tax_id of this lineage } A concrete example: [ (131567, 'no rank', 'cellular organisms'), (2, 'superkingdom', 'Bacteria') ] becomes { 'no rank': 'cellular organisms', 'superkingdom': 'Bacteria', 'tax_id': 2, } """ output: Dict[str, Union[int, str]] = {} len_lineage = len(lineage) for k, (tax_id, rank, rank_name) in enumerate(lineage): # use the last rank of the lineage as the tax_id of the lineage if k == len_lineage - 1: output["tax_id"] = tax_id rank_key = _calc_rank_key(rank, output.keys()) output[rank_key] = rank_name return output def prepare_lineages_for_output(lineages: List[Lineage]) -> pd.DataFrame: """prepares lineages into a dataframe for writing to disk""" with concurrent.futures.ProcessPoolExecutor() as executors: out = executors.map(_convert_lineage_to_dict, lineages, chunksize=5000) df_out = pd.DataFrame(out) return df_out.sort_values("tax_id") ================================================ FILE: ncbitax2lin/lineage.py ================================================ """Utilities for finding lineages.""" import logging import math import multiprocessing import os import pickle import tempfile from typing import Dict, List from ncbitax2lin import utils from ncbitax2lin.struct import Lineage, TaxUnit _LOGGER = logging.getLogger(__name__) # tax_id of first line in names.dmp: no rank ROOT_TAX_ID = 1 def _find_one_lineage(tax_id: int, tax_dict: Dict[int, TaxUnit]) -> Lineage: """Finds 
lineage for a single tax id""" if tax_id % 50000 == 0: # TODO: it's tricky why _LOGGER.info here won't make the log show up. # Note, this function is run in a subprocess. print(f"working on tax_id: {tax_id}") lineage = [] while True: record = tax_dict[tax_id] lineage.append((record["tax_id"], record["rank"], record["rank_name"])) tax_id = record["parent_tax_id"] # every tax can be traced back to tax_id == 1, the root if tax_id == ROOT_TAX_ID: break # reverse results in lineage of Kingdom => species, this is helpful for # to_dict when there are multiple "no rank"s lineage.reverse() return Lineage(lineage) def _find_lineages( tax_ids: List[int], tax_dict: Dict[int, TaxUnit], output: str ) -> None: """Finds lineages for a list of tax ids.""" lineages = [] for tax_id in tax_ids: lineage = _find_one_lineage(tax_id, tax_dict) lineages.append(lineage) with open(output, "wb") as opened: pickle.dump(lineages, opened) def _calc_num_procs(max_num: int = 6) -> int: """Calculates number of the processes to use.""" return min(multiprocessing.cpu_count(), max_num) def _calc_chunk_size(num_vals: int, num_chunks: int) -> int: """Calculates the chunk size.""" return math.ceil(num_vals / num_chunks) def find_all_lineages( tax_ids: List[int], tax_dict: Dict[int, TaxUnit] ) -> List[Lineage]: """Finds the lineages for all tax ids Args: tax_id: all tax ids to find lineages for. tax_dict: a dictionary of tax_id => tax_unit. 
""" nprocs = _calc_num_procs() _LOGGER.info( "will use %d processes to find lineages for all %s tax ids", nprocs, f"{len(tax_ids):,d}", ) chunk_size = _calc_chunk_size(len(tax_ids), num_chunks=nprocs) _LOGGER.info("chunk_size = %d", chunk_size) tax_id_chunks = utils.partition(tax_ids, size=chunk_size) _LOGGER.info("chunked sizes: %s", [len(_) for _ in tax_id_chunks]) procs, tmp_outputs, all_lineages = [], [], [] with tempfile.TemporaryDirectory(suffix="_ncbitax2lin") as tmpdir: for index, chunk in enumerate(tax_id_chunks): tmp_output = os.path.join(tmpdir, f"_lineages_{index}.pkl") tmp_outputs.append(tmp_output) proc = multiprocessing.Process( target=_find_lineages, args=(chunk, tax_dict, tmp_output) ) procs.append(proc) _LOGGER.info("Starting %d processes ...", len(procs)) for proc in procs: proc.start() _LOGGER.info("Joining %d processes ...", len(procs)) for proc in procs: proc.join() for tmp_output in tmp_outputs: _LOGGER.info("adding lineages from %s ...", tmp_output) with open(tmp_output, "rb") as opened: all_lineages.extend(pickle.load(opened)) assert len(all_lineages) == len(tax_ids), ( f"There are {len(tax_ids)} tax_ids, but {len(all_lineages)} lineages are generated, " "the two numbers should've been the same" ) return all_lineages ================================================ FILE: ncbitax2lin/ncbitax2lin.py ================================================ """Converts NCBI taxonomy dump into lineages""" import logging import sys from typing import Dict, Optional import fire import pandas as pd from ncbitax2lin import data_io, fmt, lineage, utils from ncbitax2lin.struct import TaxUnit logging.basicConfig(level=logging.DEBUG, format="%(asctime)s|%(levelname)s|%(message)s") _LOGGER = logging.getLogger(__name__) def _calc_taxonomy_dict(df_tax: pd.DataFrame) -> Dict[int, TaxUnit]: """Converts dataframe of df_tax into a dictionary with tax_id as the keys""" return dict(zip(df_tax.tax_id.values, df_tax.to_dict("records"))) def taxonomy_to_lineages( 
nodes_file: str, names_file: str, output: Optional[str] = None ) -> None: """Converts NCBI taxomony dump into lineages. NCBI taxonomy dump can be downloaded from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz Args: nodes_file: path/to/taxdump/nodes.dmp from NCBI taxonomy names_file: path/to/taxdump/names.dmp from NCBI taxonomy output_prefix: output lineages will be written to output_prefix.csv.gz """ df_data = data_io.read_names_and_nodes(names_file, nodes_file) _LOGGER.info("# of tax ids: %s", f"{df_data.shape[0]:,d}") _LOGGER.info("df.info:\n%s", f"{utils.collect_df_info(df_data)}") _LOGGER.info("Generating a dictionary of taxonomy: tax_id => tax_unit ...") tax_dict = _calc_taxonomy_dict(df_data) tax_dict_size_mb = sys.getsizeof(tax_dict) / 2**20 _LOGGER.info("size of taxonomy_dict: ~%s MB", f"{tax_dict_size_mb:.0f}") tax_ids = df_data.tax_id.to_numpy().tolist() _LOGGER.info("Finding all lineages ...") all_lineages = lineage.find_all_lineages(tax_ids, tax_dict) _LOGGER.info("Preparings all lineages into a dataframe to be written to disk ...") df_lineages = fmt.prepare_lineages_for_output(all_lineages) if output is None: output = f"ncbi_lineages_{pd.Timestamp.utcnow().date()}.csv.gz" utils.maybe_backup_file(output) _LOGGER.info("Writing lineages to %s ...", output) data_io.write_lineages_to_disk(df_lineages, output) def main() -> None: """Main function, entry point""" fire.Fire(taxonomy_to_lineages) ================================================ FILE: ncbitax2lin/struct.py ================================================ """Data strutures.""" from typing import List, NewType, Tuple from typing_extensions import TypedDict class TaxUnit(TypedDict): """ Represents a basic unit in taxonomy e.g. (phylum, Proteobacteria), where phylum is the rank, and Proteobacteria is the rank name """ tax_id: int parent_tax_id: int # tax_id of parent tax unit for this tax unit rank: str rank_name: str # A lineage is a list of (tax_id, rank, rank_name) tuples. 
Lineage = NewType("Lineage", List[Tuple[int, str, str]]) ================================================ FILE: ncbitax2lin/utils.py ================================================ """Utility functions""" import datetime import functools import io import logging import os import time from typing import Any, Callable, List, TypeVar import pandas as pd _LOGGER = logging.getLogger(__name__) def timeit(func: Callable[..., Any]) -> Callable[..., Any]: """Times a function, usually used as decorator""" @functools.wraps(func) def timed_func(*args: Any, **kwargs: Any) -> Any: """Returns the timed function""" start_time = time.time() result = func(*args, **kwargs) elapsed_time = datetime.timedelta(seconds=time.time() - start_time) _LOGGER.info("time spent on %s: %s", func.__name__, elapsed_time) return result return timed_func def maybe_backup_file(filepath: str) -> None: """ Back up a file, old_file will be renamed to #old_file.n#, where n is a number incremented each time a backup takes place """ backup = None if os.path.exists(filepath): dirname = os.path.dirname(filepath) basename = os.path.basename(filepath) count = 1 backup = os.path.join(dirname, f"#{basename}.{count}#") while os.path.exists(backup): count += 1 backup = os.path.join(dirname, f"#{basename}.{count}#") logging.info("Backing up %s to %s", filepath, backup) os.rename(filepath, backup) ElemType = TypeVar("ElemType") # pylint: disable=invalid-name def partition(vals: List[ElemType], size: int) -> List[List[ElemType]]: """Partion a list into a list of lists by size.""" return [vals[i : i + size] for i in range(0, len(vals), size)] def collect_df_info(df_data: pd.DataFrame) -> str: """Collects information of a dataframe""" buf = io.StringIO() df_data.info(buf=buf, verbose=True, memory_usage="deep") return buf.getvalue() ================================================ FILE: pylintrc ================================================ [MESSAGES CONTROL] disable=fixme, duplicate-code 
================================================ FILE: pyproject.toml ================================================ [tool.poetry] name = "ncbitax2lin" version = "3.0.0" description = "A tool that converts NCBI taxonomy dump into lineages" authors = ["Zhuyi Xue "] readme = "README.md" homepage = "https://github.com/zyxue/ncbitax2lin" license = "MIT" [tool.poetry.dependencies] fire = "^0.7.1" pandas = "^2.3.2" python = "^3.9,<3.14" typing-extensions = "^4.15.0" [tool.poetry.dev-dependencies] autoflake = "^1.3.1" black = "^22.1.0" coverage = "^7.5.4" isort = "^5.7.0" mypy = "^1.18.2" pylint = "^3.3.8" pytest = "^8.4.2" pytest-parallel = "^0.1.0" tox = "^3.21.4" [tool.poetry.scripts] ncbitax2lin = "ncbitax2lin.ncbitax2lin:main" [build-system] requires = ["poetry>=0.12"] build-backend = "poetry.masonry.api" # https://pycqa.github.io/isort/docs/configuration/black_compatibility/ [tool.isort] profile = "black" multi_line_output = 3 known_first_party = ["ncbitax2lin"] ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/test___init__.py ================================================ """tests for __init__.py""" # pylint: disable=protected-access, missing-function-docstring from ncbitax2lin import __version__ def test_version() -> None: assert __version__ == "2.4.1" ================================================ FILE: tests/test_data/names.head_20.dmp ================================================ 1 | all | | synonym | 1 | root | | scientific name | 2 | Bacteria | Bacteria | scientific name | 2 | Monera | Monera | in-part | 2 | Procaryotae | Procaryotae | in-part | 2 | Prokaryota | Prokaryota | in-part | 2 | Prokaryotae | Prokaryotae | in-part | 2 | bacteria | | blast name | 2 | eubacteria | | genbank common name | 2 | prokaryote | prokaryote | in-part | 2 | prokaryotes | prokaryotes | in-part | 6 | Azorhizobium | | scientific name | 6 | 
Azorhizobium Dreyfus et al. 1988 emend. Lang et al. 2013 | | authority | 7 | ATCC 43989 | ATCC 43989 | type material | 7 | Azorhizobium caulinodans | | scientific name | 7 | Azorhizobium caulinodans Dreyfus et al. 1988 | | authority | 7 | Azotirhizobium caulinodans | | equivalent name | 7 | CCUG 26647 | CCUG 26647 | type material | 7 | DSM 5975 | DSM 5975 | type material | 7 | IFO 14845 | IFO 14845 | type material | ================================================ FILE: tests/test_data/nodes.head_20.dmp ================================================ 1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | 2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | 6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 18 | 213421 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 19 | 18 | species | PC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 20 | 76892 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 21 | 20 | species | PI | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 22 | 267890 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | 23 | 22 | species | SC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 24 | 22 | species | SP | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 25 | 22 | species | SH | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | 27 | 49928 | species | HE | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | ================================================ FILE: tests/test_data_io.py ================================================ """tests for data_reader.py""" # pylint: disable=protected-access, missing-function-docstring from pathlib import Path 
import pandas as pd from ncbitax2lin import data_io def test_load_nodes() -> None: # top 20 lines of nodes.dmp from NCBI test_input = (Path(__file__).parent / "./test_data/nodes.head_20.dmp").as_posix() actual = data_io.load_nodes(test_input) assert isinstance(actual, pd.DataFrame) def test_load_names() -> None: # top 20 lines of names.dmp from NCBI test_input = (Path(__file__).parent / "./test_data/names.head_20.dmp").as_posix() actual = data_io.load_names(test_input) assert isinstance(actual, pd.DataFrame) ================================================ FILE: tests/test_fmt.py ================================================ """tests for fmt.py""" # pylint: disable=missing-function-docstring, protected-access from typing import Container import pytest from ncbitax2lin import fmt @pytest.mark.parametrize( "test_input_rank, test_input_existing_ranks, expected", [ ("no rank", {}, "no rank"), ("no rank", {"some other rank"}, "no rank"), ("no rank", {"no rank"}, "no rank1"), ("rankx", ["rankx"], "rankx1"), ("rankx", ["rankx", "rankx1"], "rankx2"), ], ) def test__calc_rank_key( test_input_rank: str, test_input_existing_ranks: Container[str], expected: str ) -> None: actual = fmt._calc_rank_key(test_input_rank, test_input_existing_ranks) assert actual == expected ================================================ FILE: tests/test_lineage.py ================================================ """tests for lineage.py""" # pylint: disable=missing-function-docstring, protected-access from unittest.mock import MagicMock, patch import pytest from ncbitax2lin import lineage @patch("multiprocessing.cpu_count", return_value=999, autospec=True) def test__calc_num_procs(mock_cpu_count: MagicMock) -> None: actual = lineage._calc_num_procs() expected = 6 assert actual == expected mock_cpu_count.assert_called_once_with() @pytest.mark.parametrize( "num_vals, num_chunks, chunk_size", [ (10, 3, 4), (11, 3, 4), (12, 3, 4), (13, 3, 5), (14, 3, 5), (15, 3, 5), (16, 3, 6), ], ) def 
test__calc_chunk_size_procs( num_vals: int, num_chunks: int, chunk_size: int ) -> None: actual = lineage._calc_chunk_size(num_vals, num_chunks) expected = chunk_size assert actual == expected assert isinstance(chunk_size, int) ================================================ FILE: tests/test_ncbitax2lin.py ================================================ """tests for ncbitax2lin.py""" # pylint: disable=protected-access, missing-function-docstring import pandas as pd from ncbitax2lin import ncbitax2lin def test__calc_taxonomy_dict() -> None: df_data = pd.DataFrame( { "tax_id": [1, 2, 6], "parent_tax_id": [1, 131567, 335928], "rank": ["no rank", "superkingdom", "genus"], "rank_name": [ "root", "Bacteria", "Azorhizobium", ], } ) actual = ncbitax2lin._calc_taxonomy_dict(df_data) expected = { 1: {"tax_id": 1, "parent_tax_id": 1, "rank": "no rank", "rank_name": "root"}, 2: { "tax_id": 2, "parent_tax_id": 131567, "rank": "superkingdom", "rank_name": "Bacteria", }, 6: { "tax_id": 6, "parent_tax_id": 335928, "rank": "genus", "rank_name": "Azorhizobium", }, } assert actual == expected ================================================ FILE: tests/test_utils.py ================================================ """tests for utils.py""" # pylint: disable=protected-access, missing-function-docstring import os from typing import List from unittest.mock import MagicMock, call, patch import pytest from ncbitax2lin import utils def test_maybe_backup_file_when_file_path_does_not_exist() -> None: with patch("os.path.exists", return_value=False) as mock_exists: test_input = "some_non_existing_file" utils.maybe_backup_file(test_input) mock_exists.assert_called_once_with(test_input) @patch("os.rename", spec=os.rename) @patch("os.path.exists") def test_maybe_backup_file_when_file_path_exists( mock_exists: MagicMock, mock_rename: MagicMock ) -> None: mock_exists.side_effect = [True, False] test_input = "some_existing_file" utils.maybe_backup_file(test_input) expected = 
"#some_existing_file.1#" mock_exists.assert_has_calls([call(test_input), call(expected)]) mock_rename.assert_called_once_with(test_input, expected) @patch("os.rename", spec=os.rename) @patch("os.path.exists") def test_maybe_backup_file_when_backfile_also_exists( mock_exists: MagicMock, mock_rename: MagicMock ) -> None: mock_exists.side_effect = [True, True, False] test_input = "some_existing_file" intermediary_input = "#some_existing_file.1#" utils.maybe_backup_file(test_input) expected = "#some_existing_file.2#" mock_exists.assert_has_calls( [call(test_input), call(intermediary_input), call(expected)] ) mock_rename.assert_called_once_with(test_input, expected) @pytest.mark.parametrize( "test_input, size, expected", [ ([1, 2, 3], 3, [[1, 2, 3]]), ([1, 2, 3], 2, [[1, 2], [3]]), ([1, 2, 3, 4], 2, [[1, 2], [3, 4]]), ([1, 2, 3, 4, 5], 2, [[1, 2], [3, 4], [5]]), ([1, 2, 3, 4, 5], 3, [[1, 2, 3], [4, 5]]), ], ) def test__partition( test_input: List[int], size: int, expected: List[List[int]], ) -> None: actual = utils.partition(test_input, size) assert actual == expected ================================================ FILE: tox.ini ================================================ [tox] isolated_build = True envlist = py39,py310,py311,py312,py313 [testenv] allowlist_externals = poetry pytest commands = poetry install --verbose