Repository: tskit-dev/tskit Branch: main Commit: 40698f504b6e Files: 220 Total size: 7.3 MB Directory structure: gitextract_7z1tql5y/ ├── .clang-format ├── .github/ │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── docs.yml │ ├── lint.yml │ ├── release-c.yml │ ├── tests.yml │ └── wheels.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── c/ │ ├── .gitignore │ ├── CHANGELOG.rst │ ├── VERSION.txt │ ├── examples/ │ │ ├── Makefile │ │ ├── api_structure.c │ │ ├── cpp_sorting_example.cpp │ │ ├── error_handling.c │ │ ├── haploid_wright_fisher.c │ │ ├── json_struct_metadata.c │ │ ├── multichrom_wright_fisher.c │ │ ├── multichrom_wright_fisher_singlethreaded.c │ │ ├── streaming.c │ │ ├── take_ownership.c │ │ ├── tree_iteration.c │ │ └── tree_traversal.c │ ├── meson.build │ ├── meson_options.txt │ ├── subprojects/ │ │ └── kastore/ │ │ ├── README.md │ │ ├── VERSION.txt │ │ ├── kastore.c │ │ ├── kastore.h │ │ └── meson.build │ ├── tests/ │ │ ├── meson-subproject/ │ │ │ ├── example.c │ │ │ └── meson.build │ │ ├── test_convert.c │ │ ├── test_core.c │ │ ├── test_file_format.c │ │ ├── test_genotypes.c │ │ ├── test_haplotype_matching.c │ │ ├── test_minimal_cpp.cpp │ │ ├── test_stats.c │ │ ├── test_tables.c │ │ ├── test_trees.c │ │ ├── testlib.c │ │ └── testlib.h │ ├── tskit/ │ │ ├── convert.c │ │ ├── convert.h │ │ ├── core.c │ │ ├── core.h │ │ ├── genotypes.c │ │ ├── genotypes.h │ │ ├── haplotype_matching.c │ │ ├── haplotype_matching.h │ │ ├── stats.c │ │ ├── stats.h │ │ ├── tables.c │ │ ├── tables.h │ │ ├── trees.c │ │ └── trees.h │ └── tskit.h ├── codecov.yml ├── docs/ │ ├── .gitignore │ ├── Makefile │ ├── _config.yml │ ├── _static/ │ │ ├── README │ │ └── bespoke.css │ ├── _toc.yml │ ├── build.sh │ ├── c-api.rst │ ├── changelogs.rst │ ├── citation.md │ ├── cli.md │ ├── data/ │ │ └── basic_tree_seq.trees │ ├── data-model.md │ ├── development.md │ ├── doxygen/ │ │ └── Doxyfile │ ├── export.md │ ├── file-formats.md │ ├── glossary.md │ ├── ibd.md │ ├── 
installation.md │ ├── introduction.md │ ├── metadata.md │ ├── numba.md │ ├── provenance.md │ ├── python-api.md │ ├── quickstart.md │ ├── stats.md │ ├── substitutions/ │ │ ├── linear_traversal_warning.rst │ │ ├── table_edit_warning.rst │ │ ├── table_keep_rows_main.rst │ │ ├── tree_array_warning.rst │ │ └── virtual_root_array_note.rst │ └── topological-analysis.md ├── prek.toml └── python/ ├── .gitignore ├── CHANGELOG.rst ├── MANIFEST.in ├── Makefile ├── README.rst ├── _tskitmodule.c ├── benchmark/ │ ├── config.yaml │ ├── run-for-all-releases.py │ └── run.py ├── lwt_interface/ │ ├── CHANGELOG.rst │ ├── Makefile │ ├── README.md │ ├── cython_example/ │ │ ├── Makefile │ │ ├── _lwtc.c │ │ ├── example.pyx │ │ ├── pyproject.toml │ │ └── setup.py │ ├── dict_encoding_testlib.py │ ├── example_c_module.c │ ├── setup.py │ ├── test_example_c_module.py │ └── tskit_lwt_interface.h ├── pyproject.toml ├── setup.py ├── stress_lowlevel.py ├── tests/ │ ├── __init__.py │ ├── conftest.py │ ├── data/ │ │ ├── SLiM/ │ │ │ ├── README │ │ │ ├── minimal-example.trees │ │ │ ├── minimal-example.txt │ │ │ ├── single-locus-example.trees │ │ │ └── single-locus-example.txt │ │ ├── dict-encodings/ │ │ │ ├── generate_msprime.py │ │ │ └── msprime-0.7.4.pkl │ │ ├── hdf5-formats/ │ │ │ ├── msprime-0.3.0_v2.0.hdf5 │ │ │ ├── msprime-0.4.0_v3.1.hdf5 │ │ │ └── msprime-0.5.0_v10.0.hdf5 │ │ ├── old-formats/ │ │ │ └── tskit-0.3.3.trees │ │ └── simplify-bugs/ │ │ ├── 01-edges.txt │ │ ├── 01-mutations.txt │ │ ├── 01-nodes.txt │ │ ├── 01-sites.txt │ │ ├── 02-edges.txt │ │ ├── 02-mutations.txt │ │ ├── 02-nodes.txt │ │ ├── 02-sites.txt │ │ ├── 03-edges.txt │ │ ├── 03-mutations.txt │ │ ├── 03-nodes.txt │ │ ├── 03-sites.txt │ │ ├── 04-edges.txt │ │ ├── 04-mutations.txt │ │ ├── 04-nodes.txt │ │ ├── 04-sites.txt │ │ ├── 05-edges.txt │ │ ├── 05-mutations.txt │ │ ├── 05-nodes.txt │ │ └── 05-sites.txt │ ├── ibd.py │ ├── simplify.py │ ├── test_avl_tree.py │ ├── test_balance_metrics.py │ ├── test_cli.py │ ├── 
test_coalrate.py │ ├── test_combinatorics.py │ ├── test_dict_encoding.py │ ├── test_distance_metrics.py │ ├── test_divmat.py │ ├── test_drawing.py │ ├── test_extend_haplotypes.py │ ├── test_file_format.py │ ├── test_fileobj.py │ ├── test_genotype_matching.py │ ├── test_genotypes.py │ ├── test_haplotype_matching.py │ ├── test_highlevel.py │ ├── test_ibd.py │ ├── test_immutable_table_collection.py │ ├── test_intervals.py │ ├── test_jit.py │ ├── test_ld_matrix.py │ ├── test_metadata.py │ ├── test_ms.py │ ├── test_parsimony.py │ ├── test_phylo_formats.py │ ├── test_provenance.py │ ├── test_python_c.py │ ├── test_reference_sequence.py │ ├── test_relatedness_vector.py │ ├── test_stats.py │ ├── test_table_transforms.py │ ├── test_tables.py │ ├── test_text_formats.py │ ├── test_threads.py │ ├── test_topology.py │ ├── test_tree_positioning.py │ ├── test_tree_stats.py │ ├── test_util.py │ ├── test_utilities.py │ ├── test_vcf.py │ ├── test_version.py │ ├── test_wright_fisher.py │ └── tsutil.py └── tskit/ ├── __init__.py ├── __main__.py ├── _version.py ├── cli.py ├── combinatorics.py ├── drawing.py ├── exceptions.py ├── genotypes.py ├── intervals.py ├── jit/ │ ├── __init__.py │ └── numba.py ├── metadata.py ├── metadata_schema.schema.json ├── provenance.py ├── provenance.schema.json ├── stats.py ├── tables.py ├── text_formats.py ├── trees.py ├── util.py └── vcf.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ Language: Cpp BasedOnStyle: GNU SortIncludes: false AllowShortIfStatementsOnASingleLine: false BreakBeforeBraces: Linux TabWidth: 4 IndentWidth: 4 ColumnLimit: 89 SpaceBeforeParens: ControlStatements SpacesInCStyleCastParentheses: false SpaceAfterCStyleCast: true IndentCaseLabels: true AlignAfterOpenBracket: DontAlign BinPackArguments: true BinPackParameters: true AlwaysBreakAfterReturnType: 
AllDefinitions StatementMacros: ["PyObject_HEAD", "Py_BEGIN_ALLOW_THREADS", "Py_END_ALLOW_THREADS"] AlignConsecutiveMacros: true ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## Description Thanks for contributing to tskit! :heart: A guide to the PR process is [here](https://tskit.dev/tskit/docs/stable/development.html#git-workflow) Please replace this text with a summary of the change and which issue is fixed, if any. Please also include relevant motivation and context. Fixes #(issue) <- Putting the issue number here will auto-close the issue when this PR is merged # PR Checklist: - [ ] Tests that fully cover new/changed functionality. - [ ] Documentation including tutorial content if appropriate. - [ ] Changelogs, if there are API changes. ================================================ FILE: .github/workflows/docs.yml ================================================ name: Build Docs on: pull_request: merge_group: push: branches: [main] tags: - '*' env: FORCE_COLOR: 1 jobs: Docs: uses: tskit-dev/.github/.github/workflows/docs.yml@v15 with: pyproject-directory: python additional-apt-packages: doxygen pre-build-command: cd docs/doxygen && doxygen ================================================ FILE: .github/workflows/lint.yml ================================================ name: Lint on: pull_request: merge_group: jobs: Lint: uses: tskit-dev/.github/.github/workflows/lint.yml@v15 with: pyproject-directory: python ================================================ FILE: .github/workflows/release-c.yml ================================================ name: Publish C API release on: push: branches: [main, test] tags: ['*'] env: FORCE_COLOR: 1 jobs: build: runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@v6.0.2 - name: Install uv uses: astral-sh/setup-uv@v6 with: version: "0.10.0" - name: Install system deps run: | sudo apt-get update sudo apt-get install -y 
ninja-build libcunit1-dev - name: Install meson run: uv tool install meson==1.10.1 - name: Build tarball run: | git rm -rf c/tests/meson-subproject git config --global user.email "CI@CI.com" git config --global user.name "Mr Robot" git add -A git commit -m "dummy commit to make meson not add in the symlinked directory" meson c build-gcc meson dist -C build-gcc - name: C Release uses: softprops/action-gh-release@v2.5.0 if: startsWith(github.ref, 'refs/tags/') && contains(github.event.ref, 'C_') with: draft: True files: build-gcc/meson-dist/* ================================================ FILE: .github/workflows/tests.yml ================================================ name: Tests on: pull_request: merge_group: push: branches: [main, test] env: FORCE_COLOR: 1 jobs: packaging: name: Python packaging uses: tskit-dev/.github/.github/workflows/python-packaging.yml@v15 with: pyproject-directory: python cli-test-cmd: tskit --help test-c: name: C tests uses: tskit-dev/.github/.github/workflows/c-tests.yml@v15 with: library-directory: c secrets: inherit test-python-c: name: Python-C tests uses: tskit-dev/.github/.github/workflows/python-c-tests.yml@v15 with: tests: python/tests/test_python_c.py python/tests/test_dict_encoding.py pyproject-directory: python secrets: inherit test: name: Python uses: tskit-dev/.github/.github/workflows/python-tests.yml@v15 with: os: ${{ matrix.os }} python-version: ${{ matrix.python }} pyproject-directory: python coverage-directory: python/tskit secrets: inherit strategy: matrix: python: [ 3.11, 3.13 ] os: [ macos-latest, ubuntu-24.04, windows-latest ] msys2: runs-on: windows-latest strategy: matrix: include: - { sys: mingw32, env: i686 } - { sys: mingw64, env: x86_64 } name: Windows (${{ matrix.sys }}, ${{ matrix.env }}) defaults: run: shell: msys2 {0} steps: - name: Cancel Previous Runs uses: styfle/cancel-workflow-action@0.13.0 with: access_token: ${{ github.token }} - name: 'Checkout' uses: actions/checkout@v6.0.2 - name: Setup MSYS2 
${{matrix.sys}} uses: msys2/setup-msys2@v2.27.0 with: msystem: ${{matrix.sys}} update: true install: >- git mingw-w64-${{matrix.env}}-toolchain mingw-w64-${{matrix.env}}-ninja mingw-w64-${{matrix.env}}-meson mingw-w64-${{matrix.env}}-cunit - name: Build working-directory: c run: | meson build -Dbuild_examples=false ninja -C build - name: Run tests working-directory: c run: | ninja -C build test bespoke-python-test: name: Bespoke Python tests runs-on: ubuntu-24.04 steps: - name: Cancel Previous Runs uses: styfle/cancel-workflow-action@0.13.0 with: access_token: ${{ github.token }} - name: Checkout uses: actions/checkout@v6.0.2 with: submodules: true - name: Install uv and set the python version uses: astral-sh/setup-uv@v6 with: python-version: 3.11 version: "0.10.0" - name: Install Python dependencies working-directory: python run: uv sync --locked --group test --no-default-groups - name: Minidom test working-directory: python # Importing either IPython or pytest causes import of xml.dom.minidom # So to actually test that tskit imports it, we need a minimal test run: | uv run --locked --group test --no-default-groups \ python -c "import tskit;tskit.Tree.generate_star(5).tree_sequence.draw_svg(path='test.svg')" - name: Run JIT code coverage run: | NUMBA_DISABLE_JIT=1 uv run --locked --project=python --no-default-groups\ pytest --cov=python/tskit --cov-report=xml --cov-branch \ python/tests/test_jit.py - name: Upload coverage to Codecov uses: codecov/codecov-action@v5.5.2 with: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: true files: coverage.xml disable_search: true verbose: true flags: python-tests-no-jit - name: Build example LWT interface code and test working-directory: python/lwt_interface/ run: | make allchecks uv run --project=../ --group=test pytest -vs - name: Build cython example LWT interface code and run working-directory: python/lwt_interface/cython_example run: make bespoke-c-test: name: Bespoke C tests runs-on: ubuntu-24.04 steps: - name: 
Cancel Previous Runs uses: styfle/cancel-workflow-action@0.13.0 with: access_token: ${{ github.token }} - name: Checkout uses: actions/checkout@v6.0.2 with: submodules: true - name: Install system deps run: | sudo apt-get update sudo apt-get install -y libcunit1-dev ninja-build clang - name: Install uv uses: astral-sh/setup-uv@v6 with: version: "0.10.0" - name: Install uv deps run: | uv tool install meson==1.10.1 - name: Configure code run: CFLAGS=-D_TSK_BIG_TABLES CPPFLAGS=-D_TSK_BIG_TABLES meson setup build-bt c/ - name: Compile run: ninja -C build-bt - name: Run tests run: ninja -C build-bt test - name: Test building with meson subproject run: | meson build-subproject c/tests/meson-subproject ninja -C build-subproject ./build-subproject/example - name: Install shared library and hand-compile program. run: | meson build-install c --prefix=/usr sudo ninja -C build-install install clang c/examples/api_structure.c -I c/subprojects/kastore -o api_structure -ltskit ./api_structure - name: Run example make file run: | make -C c/examples ================================================ FILE: .github/workflows/wheels.yml ================================================ name: Publish Python release on: push: branches: [test-publish] release: types: [published] jobs: build-wheels: if: "!startsWith(github.ref, 'refs/tags/C_')" uses: tskit-dev/.github/.github/workflows/build-wheels.yml@v15 with: pyproject-directory: python publish: runs-on: ubuntu-24.04 environment: release needs: [ 'build-wheels' ] permissions: id-token: write steps: - name: Download artifacts uses: actions/download-artifact@v7.0.0 with: pattern: build-* path: dist merge-multiple: true - name: Show artifacts run: ls -lah dist - name: Publish distribution to Test PyPI if: github.event_name == 'push' && github.ref_name == 'test-publish' uses: pypa/gh-action-pypi-publish@v1.13.0 with: repository-url: https://test.pypi.org/legacy/ verbose: true - name: Publish distribution to Production PyPI if: 
github.event_name == 'release' uses: pypa/gh-action-pypi-publish@v1.13.0 ================================================ FILE: .gitignore ================================================ build-gcc .DS_Store python/benchmark/*.trees python/benchmark/*.json python/benchmark/*.html .venv .env .vscode env ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Tskit is a free and open-source project that welcomes contributions from everyone. The [Developer documentation](https://tskit.dev/tskit/docs/latest/development.html) will help you get started. We have an active slack group where tskit and associated projects are discussed. If you wish to join email [admin@tskit.dev](mailto:admin@tskit.dev). We ask all users to follow our [code of conduct](https://github.com/tskit-dev/.github/blob/main/CODE_OF_CONDUCT.md) when interacting with the project. ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2018-2019 Tskit Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # tskit [![License](https://img.shields.io/github/license/tskit-dev/tskit)](https://github.com/tskit-dev/tskit/blob/main/LICENSE) [![Contributors](https://img.shields.io/github/contributors/tskit-dev/tskit)](https://github.com/tskit-dev/tskit/graphs/contributors) [![Commit activity](https://img.shields.io/github/commit-activity/m/tskit-dev/tskit)](https://github.com/tskit-dev/tskit/commits/main) [![Coverage](https://codecov.io/gh/tskit-dev/tskit/branch/main/graph/badge.svg)](https://codecov.io/gh/tskit-dev/tskit) ![OS](https://img.shields.io/badge/OS-linux%20%7C%20OSX%20%7C%20win--64-steelblue) [Documentation (stable)](https://tskit.dev/tskit/docs/stable/) • [Documentation (latest)](https://tskit.dev/tskit/docs/latest/) [![Docs Build](https://github.com/tskit-dev/tskit/actions/workflows/docs.yml/badge.svg)](https://github.com/tskit-dev/tskit/actions/workflows/docs.yml)[![Tests](https://github.com/tskit-dev/tskit/actions/workflows/tests.yml/badge.svg)](https://github.com/tskit-dev/tskit/actions/workflows/tests.yml) The succinct tree sequence (`tskit`) format is an efficient way of representing the genetic history - sometimes known as an [Ancestral Recombination Graph or ARG](https://doi.org/10.1093/genetics/iyae100) - of a set of related DNA sequences. `Tskit` is used by a number of software libraries and programs (such as [msprime](https://github.com/tskit-dev/msprime), [SLiM](https://github.com/MesserLab/SLiM), [fwdpp](http://molpopgen.github.io/fwdpp/), and [tsinfer](https://tskit.dev/tsinfer/docs/stable/)) that either simulate or infer the evolutionary ancestry of genetic sequences. 
The `tskit` library provides the underlying functionality used to load, examine, and manipulate ARGs in the tree sequence format, including efficient access to the sequence of correlated trees along a genome and general methods to calculate genetic statistics. `Tskit` often forms part of an installation of other software packages such as those listed above. Please see the [documentation](https://tskit.dev/tskit/docs/stable/) for further details, which includes [installation instructions](https://tskit.dev/tskit/docs/stable/installation.html). To get started with tskit, tutorials and other content are at https://tskit.dev. For help and support from the community you can use [discussions](https://github.com/tskit-dev/tskit/discussions) here on GitHub, or raise an issue for a specific bug or feature request. We warmly welcome contributions from the community. Raise an issue if you have an idea you'd like to work on, or submit a PR for comments and help. The base `tskit` library provides both a [Python](https://tskit.dev/tskit/docs/stable/python-api.html) and [C](https://tskit.dev/tskit/docs/stable/c-api.html) API. A Rust API is provided in the [tskit-rust](https://github.com/tskit-dev/tskit-rust) repository. #### Python API [![PyPI version](https://img.shields.io/pypi/v/tskit.svg)](https://pypi.org/project/tskit/) [![Supported Python Versions](https://img.shields.io/pypi/pyversions/tskit.svg)](https://pypi.org/project/tskit/) [![Wheel](https://img.shields.io/pypi/wheel/tskit)](https://pypi.org/project/tskit/) [![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) Most users of `tskit` will use the Python API as it provides a convenient, high-level API to access, analyse and create tree sequences. Full documentation is [here](https://tskit.dev/tskit/docs/stable/python-api.html). 
#### C API [![C99](https://img.shields.io/badge/Language-C99-steelblue.svg)](https://en.wikipedia.org/wiki/C99) The `tskit` C API provides comprehensive, low-level methods for manipulating and processing tree-sequences. Written to the C99 standard and fully thread-safe, it can be used with either C or C++. Full documentation is [here](https://tskit.dev/tskit/docs/stable/c-api.html). ## Installation ```bash python -m pip install tskit # or conda install -c conda-forge tskit ``` ================================================ FILE: c/.gitignore ================================================ build .*.swp .*.swo ================================================ FILE: c/CHANGELOG.rst ================================================ -------------------- [1.3.2] - 2026-XX-XX -------------------- In development - Add ``tsk_json_struct_metadata_get_blob`` function (:user:`benjeffery`, :pr:`3306`) -------------------- [1.3.1] - 2026-03-06 -------------------- Maintenance release. - Update to kastore 2.1.2 - Fix doc typo for file uuid (:pr:`3399`) - Migrate linting to clang-format 21.1.8 (:pr:`3389`) - Support compile time setting of debug stream (:pr:`3364`) -------------------- [1.3.0] - 2025-11-27 -------------------- **Breaking changes** - ``trees.c`` now depends on ``genotypes.c`` (via ``tskit/genotypes.h``) and must be built and linked together with it. (:user:`benjeffery`, :pr:`3324`) **Features** - ``tsk_variant_init`` and associated variant decoding methods now fully support ``TSK_ISOLATED_NOT_MISSING`` not being set for internal nodes. (:user:`benjeffery`, :pr:`3313`) - Add ``tsk_treeseq_decode_alignments`` to decode full-length reference-based sequence alignments for specified nodes over a genomic interval, respecting ``TSK_ISOLATED_NOT_MISSING`` semantics. (:user:`benjeffery`, :pr:`3324`, :issue:`3319`) -------------------- [1.2.0] - 2025-09-24 -------------------- **Breaking changes** - Remove ``tsk_diff_iter_t`` and associated functions. 
(:user:`benjeffery`, :pr:`3221`, :issue:`2797`). - ``tsk_treeseq_init`` now requires that mutation parents in the table collection are correct and consistent with the topology of the tree at each mutation site. Returns ``TSK_ERR_BAD_MUTATION_PARENT`` if this is not the case, or ``TSK_ERR_MUTATION_PARENT_AFTER_CHILD`` if the mutations are not in an order compatible with the correct mutation parent. (:user:`benjeffery`, :issue:`2729`, :issue:`2732`, :pr:`3212`). **Features** - Add ``TSK_TS_INIT_COMPUTE_MUTATION_PARENTS`` to ``tsk_treeseq_init`` to compute mutation parents from the tree sequence topology. Note that the mutations must be in the correct order. (:user:`benjeffery`, :issue:`2757`, :pr:`3212`). - Add ``TSK_CHECK_MUTATION_PARENTS`` option to ``tsk_table_collection_check_integrity`` to check that mutation parents are consistent with the tree sequence topology. This option implies ``TSK_CHECK_TREES``. (:user:`benjeffery`, :issue:`2729`, :issue:`2732`, :pr:`3212`). - Add the ``TSK_NO_CHECK_INTEGRITY`` option to ``tsk_table_collection_compute_mutation_parents`` to skip the integrity checks that are normally run when computing mutation parents. This is useful for speeding up the computation of mutation parents when the tree sequence is certainly known to be valid. (:user:`benjeffery`, :pr:`3212`). - Mutations returned by ``tsk_treeseq_get_mutation`` now include pre-computed ``inherited_state`` and ``inherited_state_length`` fields. The inherited state is computed during tree sequence initialization and represents the state that existed at the site before each mutation occurred (either the ancestral state if the mutation is the root mutation or the derived state of the parent mutation). Note that this breaks ABI compatibility due to the addition of these fields to the ``tsk_mutation_t`` struct. (:user:`benjeffery`, :pr:`3277`, :issue:`2631`). 
-------------------- [1.1.4] - 2025-03-31 -------------------- **Changes** - Added the TSK_TRACE_ERRORS macro to enable tracing of errors in the C library. This is useful for debugging as errors will print to stderr when set. (:user:`jeromekelleher`, :pr:`3095`). -------------------- [1.1.3] - 2024-10-16 -------------------- **Features** - Add the `tsk_treeseq_extend_haplotypes` method that can compress a tree sequence by extending edges into adjacent trees and thus creating unary nodes in those trees (:user:`petrelharp`, :user:`hfr1tze`, :user:`avabamf`, :pr:`2651`, :pr:`2938`). -------------------- [1.1.2] - 2023-05-17 -------------------- **Performance improvements** - tsk_tree_seek is now much faster at seeking to arbitrary points along the sequence from the null tree (:user:`molpopgen`, :pr:`2661`). **Features** - The struct ``tsk_treeseq_t`` now has the variables ``min_time`` and ``max_time``, which are the minimum and maximum among the node times and mutation times, respectively. ``min_time`` and ``max_time`` can be accessed using the functions ``tsk_treeseq_get_min_time`` and ``tsk_treeseq_get_max_time``, respectively. (:user:`szhan`, :pr:`2612`, :issue:`2271`) - Add the `TSK_SIMPLIFY_NO_FILTER_NODES` option to simplify to allow unreferenced nodes to be kept in the output (:user:`jeromekelleher`, :user:`hyanwong`, :issue:`2606`, :pr:`2619`). - Add the `TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option to simplify which ensures no node sample flags are changed to allow calling code to manage sample status. (:user:`jeromekelleher`, :issue:`2662`, :pr:`2663`). - Guarantee that unfiltered tables are not written to unnecessarily during simplify (:user:`jeromekelleher`, :pr:`2619`). - Add `x_table_keep_rows` methods to provide efficient in-place table subsetting (:user:`jeromekelleher`, :pr:`2700`). 
- Add `tsk_tree_seek_index` function -------------------- [1.1.1] - 2022-07-29 -------------------- **Bug fixes** - Fix segfault in tsk_variant_restricted_copy in tree sequences with large numbers of alleles or very long alleles (:user:`jeromekelleher`, :pr:`2437`, :issue:`2429`). -------------------- [1.1.0] - 2022-07-14 -------------------- **Features** - Add ``num_children`` to ``tsk_tree_t``, an array which contains counts of the number of child nodes of each node in the tree. (:user:`GertjanBisschop`, :issue:`2274`, :pr:`2316`) - Add ``edge`` to ``tsk_tree_t``, an array which contains the ``edge_id`` of the edge encoding the relationship between the child node and its parent for each (child) node in the tree. (:user:`GertjanBisschop`, :issue:`2304`, :pr:`2340`) **Changes** - Reduce the maximum number of rows in a table by 1. This removes edge cases so that a ``tsk_id_t`` can be used to count the number of rows. (:user:`benjeffery`, :issue:`2336`, :pr:`2337`) - Samples are now copied by ``tsk_variant_restricted_copy``. (:user:`benjeffery`, :issue:`2400`, :pr:`2401`) -------------------- [1.0.0] - 2022-05-24 -------------------- This major release marks the point at which the documented API becomes stable and supported. **Breaking changes** - Change the type of genotypes to ``int32_t``, removing the TSK_16_BIT_GENOTYPES flag option. (:user:`benjeffery`, :issue:`463`, :pr:`2108`) - ``tsk_variant_t`` now includes its ``tsk_site_t`` rather than pointing to it. (:user:`benjeffery`, :issue:`2161`, :pr:`2162`) - Rename ``TSK_TAKE_TABLES`` to ``TSK_TAKE_OWNERSHIP``. (:user:`benjeffery`, :issue:`2221`, :pr:`2222`) - ``TSK_DEBUG``, ``TSK_NO_INIT``, ``TSK_NO_CHECK_INTEGRITY`` and ``TSK_TAKE_OWNERSHIP`` have moved to ``core.h`` (:user:`benjeffery`, :issue:`2218`, :pr:`2230`) - Rename several flags: - All flags to ``simplify`` for example ``TSK_KEEP_INPUT_ROOTS`` becomes ``TSK_SIMPLIFY_KEEP_INPUT_ROOTS``. 
- All flags to ``subset`` for example ``TSK_KEEP_UNREFERENCED`` becomes ``TSK_SUBSET_KEEP_UNREFERENCED``. - ``TSK_BUILD_INDEXES`` -> ``TSK_TS_INIT_BUILD_INDEXES`` - ``TSK_NO_METADATA`` -> ``TSK_TABLE_NO_METADATA`` - ``TSK_NO_EDGE_METADATA`` -> ``TSK_TC_NO_EDGE_METADATA`` (:user:`benjeffery`, :issue:`1720`, :pr:`2226`, :pr:`2229`, :pr:`2224`) - Remove the generic ``TSK_ERR_OUT_OF_BOUNDS`` - replacing with specific errors. Remove ``TSK_ERR_NON_SINGLE_CHAR_MUTATION`` which was unused. (:user:`benjeffery`, :pr:`2260`) - Reorder stats API methods to place ``result`` as the last argument. (:user:`benjeffery`, :pr:`2292`, :issue:`2285`) **Features** - Make dumping of tables and tree sequences to disk a zero-copy operation. (:user:`benjeffery`, :issue:`2111`, :pr:`2124`) - Add ``edge`` attribute to ``mutation_t`` struct and make available in tree sequence. (:user:`jeromekelleher`, :issue:`685`, :pr:`2279`) - Reduce peak memory usage in ``tsk_treeseq_simplify``. (:user:`jeromekelleher`, :issue:`2287`, :pr:`2288`) ---------------------- [0.99.15] - 2021-12-07 ---------------------- **Breaking changes** - The ``tables`` argument to ``tsk_treeseq_init`` is no longer ``const``, to allow for future no-copy tree sequence creation. (:user:`benjeffery`, :issue:`1718`, :pr:`1719`) - Additional consistency checks for mutation tables are now run by ``tsk_table_collection_check_integrity`` even when ``TSK_CHECK_MUTATION_ORDERING`` is not passed in. (:user:`petrelharp`, :issue:`1713`, :pr:`1722`) - ``num_tracked_samples`` and ``num_samples`` in ``tsk_tree_t`` are now typed as ``tsk_size_t`` (:user:`benjeffery`, :issue:`1723`, :pr:`1727`) - The previously deprecated option ``TSK_SAMPLE_COUNTS`` has been removed. (:user:`benjeffery`, :issue:`1744`, :pr:`1761`). - Individuals are no longer guaranteed or required to be topologically sorted in a tree sequence. ``tsk_table_collection_sort`` no longer sorts individuals. 
(:user:`benjeffery`, :issue:`1774`, :pr:`1789`) - The ``tsk_tree_t.left_root`` member has been removed. Client code can be updated most easily by using the equivalent ``tsk_tree_get_left_root`` function. However, it may be worth considering updating code to use either the standard traversal functions (which automatically iterate over roots) or to use the ``virtual_root`` member (which may lead to more concise code). (:user:`jeromekelleher`, :issue:`1796`, :pr:`1862`) - Rename ``tsk_tree_t.left`` and ``tsk_tree_t.right`` members to ``tsk_tree_t.interval.left`` and ``tsk_tree_t.interval.right`` respectively. (:user:`jeromekelleher`, :issue:`1686`, :pr:`1913`) - ``kastore`` is now vendored into this repo instead of being a git submodule. Developers need to run ``git submodule update``. (:user:`jeromekelleher`, :issue:`1687`, :pr:`1973`) - ``Tree`` arrays such as ``left_sib``, ``right_child`` etc. now have an additional "virtual root" node at the end. (:user:`jeromekelleher`, :issue:`1691`, :pr:`1704`) - ``marked`` and ``mark`` have been removed from ``tsk_tree_t``. (:user:`jeromekelleher`, :pr:`1936`) **Features** - Add ``tsk_table_collection_individual_topological_sort`` to sort the individuals as this is no longer done by the default sort. (:user:`benjeffery`, :issue:`1774`, :pr:`1789`) - The default behaviour for table size growth is now to double the current size of the table, up to a threshold. To keep the previous behaviour, use (e.g.) ``tsk_edge_table_set_max_rows_increment(tables->edges, 1024)``, which results in adding space for 1024 additional rows each time we run out of space in the edge table. (:user:`benjeffery`, :issue:`5`, :pr:`1683`) - ``tsk_table_collection_check_integrity`` now has a ``TSK_CHECK_MIGRATION_ORDERING`` flag. (:user:`petrelharp`, :pr:`1722`) - The default behaviour for ragged column growth is now to double the current size of the column, up to a threshold. To keep the previous behaviour, use (e.g.) 
``tsk_node_table_set_max_metadata_length_increment(tables->nodes, 1024)``, which results in adding space for 1024 additional entries each time we run out of space in the ragged column. (:user:`benjeffery`, :issue:`1703`, :pr:`1709`) - Support for compiling the C library on Windows using msys2 (:user:`jeromekelleher`, :pr:`1742`). - Add ``time_units`` to ``tsk_table_collection_t`` to describe the units of the time dimension of the tree sequence. This is then used to generate an error if ``time_units`` is ``uncalibrated`` when using the branch lengths in statistics. (:user:`benjeffery`, :issue:`1644`, :pr:`1760`) - Add the ``TSK_LOAD_SKIP_TABLES`` option to load just the top-level information from a file. Also add the ``TSK_CMP_IGNORE_TABLES`` option to compare only the top-level information in two table collections. (:user:`clwgg`, :pr:`1882`, :issue:`1854`). - Add reference sequence. (:user:`jeromekelleher`, :user:`benjeffery`, :issue:`146`, :pr:`1911`, :pr:`1944`, :pr:`1911`) - Add the ``TSK_LOAD_SKIP_REFERENCE_SEQUENCE`` option to load a table collection without the reference sequence. Also add the ``TSK_CMP_IGNORE_REFERENCE_SEQUENCE`` option to compare two table collections without comparing their reference sequence. (:user:`clwgg`, :pr:`2019`, :issue:`1971`). - Add a "virtual root" to ``Tree`` arrays such as ``left_sib``, ``right_child`` etc. The virtual root is appended to each array, has all real roots as its children, but is not the parent of any node. Simplifies traversal algorithms. (:user:`jeromekelleher`, :issue:`1691`, :pr:`1704`) - Add ``num_edges`` to ``tsk_tree_t`` to count the edges that define the topology of the tree. (:user:`jeromekelleher`, :pr:`1704`) - Add the ``tsk_tree_get_size_bound`` function which returns an upper bound on the number of nodes reachable from the roots of a tree. Useful for tree stack allocations (:user:`jeromekelleher`, :pr:`1704`). - Add ``MetadataSchema.permissive_json`` for an easy way to get the simplest schema. 
---------------------- [0.99.14] - 2021-09-03 ---------------------- **Breaking changes** - 64 bits are now used to store the sizes of ragged table columns such as metadata, allowing them to hold more data. As such ``tsk_size_t`` is now 64 bits wide. This change is fully backwards and forwards compatible for all tree-sequences whose ragged column sizes fit into 32 bits. New tree-sequences with large offset arrays that require 64 bits will fail to load in previous versions with error ``TSK_ERR_BAD_COLUMN_TYPE``. (:user:`jeromekelleher`, :issue:`343`, :issue:`1527`, :issue:`1528`, :issue:`1530`, :issue:`1554`, :issue:`1573`, :issue:`1589`,:issue:`1598`,:issue:`1628`, :pr:`1571`, :pr:`1579`, :pr:`1585`, :pr:`1590`, :pr:`1602`, :pr:`1618`, :pr:`1620`, :pr:`1652`). **Features** - Add `tsk_X_table_update_row` methods which allow modifying single rows of tables (:user:`jeromekelleher`, :issue:`1545`, :pr:`1552`). ---------------------- [0.99.13] - 2021-07-08 ---------------------- **Fixes** - Fix segfault when very large columns overflow (:user:`bhaller`, :user:`benjeffery`, :issue:`1509`, :pr:`1511`). ---------------------- [0.99.12] - 2021-05-14 ---------------------- **Breaking changes** - Removed ``TSK_NO_BUILD_INDEXES``. Not building indexes is now the default behaviour of `tsk_table_collection_dump` and related functions. (:user:`molpopgen`, :issue:`1327`, :pr:`1337`). **Features** - Add ``tsk_*_table_extend`` methods to append to a table from another (:user:`benjeffery`, :issue:`1271`, :pr:`1287`). **Fixes** ---------------------- [0.99.11] - 2021-03-16 ---------------------- **Features** - Add ``parents`` to the individual table to enable recording of pedigrees (:user:`ivan-krukov`, :user:`benjeffery`, :issue:`852`, :pr:`1125`, :pr:`866`, :pr:`1153`, :pr:`1177`, :pr:`1199`). 
- Added a ``tsk_table_collection_canonicalise`` method, that allows checking for equality between tables that are equivalent up to reordering (:user:`petrelharp`, :user:`mufernando`, :pr:`1108`). - Removed a previous requirement on ``tsk_table_collection_union``, allowing for unioning of new information both above and below shared history (:user:`petrelharp`, :user:`mufernando`, :pr:`1108`). - Support migrations in ``tsk_table_collection_sort``. (:user:`jeromekelleher`, :issue:`22`, :issue:`117`, :pr:`1131`). **Breaking changes** - Method ``tsk_individual_table_add_row`` has extra arguments ``parents`` and ``parents_length``. - Add an ``options`` argument to ``tsk_table_collection_subset`` (:user:`petrelharp`, :pr:`1108`), to allow for retaining the order of populations. - Mutation error codes have changed **Changes** - Allow mutations that have the same derived state as their parent mutation. (:user:`benjeffery`, :issue:`1180`, :pr:`1233`) - File minor version change to support individual parents ---------------------- [0.99.10] - 2021-01-25 ---------------------- Minor bugfix on internal APIs --------------------- [0.99.9] - 2021-01-22 --------------------- **Features** - Add ``TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS`` flag to simplify, which allows the user to keep unary nodes only if they belong to a tabled individual. This is useful for simplification in forwards simulations (:user:`hyanwong`, :issue:`1113`, :pr:`1119`). --------------------- [0.99.8] - 2020-11-27 --------------------- **Features** - Add ``tsk_treeseq_genetic_relatedness`` for calculating genetic relatedness between pairs of sets of nodes (:user:`brieuclehmann`, :issue:`1021`, :pr:`1023`, :issue:`974`, :issue:`973`, :pr:`898`). - Exposed ``tsk_table_collection_set_indexes`` to the API (:user:`benjeffery`, :issue:`870`, :pr:`921`). 
**Breaking changes** - Added an ``options`` argument to ``tsk_table_collection_equals`` and table equality methods to allow for more flexible equality criteria (e.g., ignore top-level metadata and schema or provenance tables). Existing code should add an extra final parameter ``0`` to retain the current behaviour (:user:`mufernando`, :user:`jeromekelleher`, :issue:`896`, :pr:`897`, :issue:`913`, :pr:`917`). - Changed default behaviour of ``tsk_table_collection_clear`` to not clear provenances and added ``options`` argument to optionally clear provenances and schemas (:user:`benjeffery`, :issue:`929`, :pr:`1001`). - Renamed ``ts.trait_regression`` to ``ts.trait_linear_model``. --------------------- [0.99.7] - 2020-09-29 --------------------- - Added ``TSK_INCLUDE_TERMINAL`` option to ``tsk_diff_iter_init`` to output the last edges at the end of a tree sequence (:user:`hyanwong`, :issue:`783`, :pr:`787`). - Added ``tsk_bug_assert`` for assertions that should be compiled into release binaries (:user:`benjeffery`, :pr:`860`). --------------------- [0.99.6] - 2020-09-04 --------------------- **Bugfixes** - :issue:`823` - Fix mutation time error when using ``tsk_table_collection_simplify`` with ``TSK_SIMPLIFY_KEEP_INPUT_ROOTS`` (:user:`petrelharp`, :pr:`823`). --------------------- [0.99.5] - 2020-08-27 --------------------- **Breaking changes** - The macro ``TSK_IMPUTE_MISSING_DATA`` is renamed to ``TSK_ISOLATED_NOT_MISSING`` (:user:`benjeffery`, :issue:`716`, :pr:`794`) **New features** - Add a ``TSK_SIMPLIFY_KEEP_INPUT_ROOTS`` option to simplify which, if enabled, adds edges from the MRCAs of samples in the simplified tree sequence back to the roots in the input tree sequence (:user:`jeromekelleher`, :issue:`775`, :pr:`782`). **Bugfixes** - :issue:`777` - Mutations over isolated samples were incorrectly decoded as missing data. (:user:`jeromekelleher`, :pr:`778`) - :issue:`776` - Fix a segfault when a partial list of samples was provided to the ``variants`` iterator. 
(:user:`jeromekelleher`, :pr:`778`) --------------------- [0.99.4] - 2020-08-12 --------------------- **Note** - The ``TSK_VERSION_PATCH`` macro was incorrectly set to ``4`` for 0.99.3, so both 0.99.4 and 0.99.3 have the same value. **Changes** - Mutation times can be a mixture of known and unknown as long as for each individual site they are either all known or all unknown (:user:`benjeffery`, :pr:`761`). **Bugfixes** - Fix for including core.h under C++ (:user:`petrelharp`, :pr:`755`). --------------------- [0.99.3] - 2020-07-27 --------------------- **Breaking changes** - ``tsk_mutation_table_add_row`` has an extra ``time`` argument. If the time is unknown ``TSK_UNKNOWN_TIME`` should be passed. (:user:`benjeffery`, :pr:`672`) - Change genotypes from unsigned to signed to accommodate missing data (see :issue:`144` for discussion). This only affects users of the ``tsk_vargen_t`` class. Genotypes are now stored as int8_t and int16_t types rather than the former unsigned types. The field names in the genotypes union of the ``tsk_variant_t`` struct returned by ``tsk_vargen_next`` have been renamed to ``i8`` and ``i16`` accordingly; care should be taken when updating client code to ensure that types are correct. The number of distinct alleles supported by 8 bit genotypes has therefore dropped from 255 to 127, with a similar reduction for 16 bit genotypes. - Change the ``tsk_vargen_init`` method to take an extra parameter ``alleles``. To keep the current behaviour, set this parameter to NULL. - Edges can now have metadata. Hence edge methods now take two extra arguments: metadata and metadata length. The file format has also changed to accommodate this, but is backwards compatible. Edge metadata can be disabled for a table collection with the TSK_NO_EDGE_METADATA flag. (:user:`benjeffery`, :pr:`496`, :pr:`712`) - Migrations can now have metadata. Hence migration methods now take two extra arguments: metadata and metadata length. 
The file format has also changed to accommodate this, but is backwards compatible. (:user:`benjeffery`, :pr:`505`) - The text dump of tables with metadata now includes the metadata schema as a header. (:user:`benjeffery`, :pr:`493`) - Bad tree topologies are detected earlier, so that it is no longer possible to create a tsk_treeseq_t object which contains a parent with contradictory children on an interval. Previously an error occurred when some operation building the trees was attempted (:user:`jeromekelleher`, :pr:`709`). **New features** - New methods to perform set operations on table collections. ``tsk_table_collection_subset`` subsets and reorders table collections by nodes (:user:`mufernando`, :user:`petrelharp`, :pr:`663`, :pr:`690`). ``tsk_table_collection_union`` forms the node-wise union of two table collections (:user:`mufernando`, :user:`petrelharp`, :issue:`381`, :pr:`623`). - Mutations now have an optional double-precision floating-point ``time`` column. If not specified, this defaults to a particular NaN value (``TSK_UNKNOWN_TIME``) indicating that the time is unknown. For a tree sequence to be considered valid it must meet new criteria for mutation times, see :ref:`sec_mutation_requirements`. Add ``tsk_table_collection_compute_mutation_times`` and new flag to ``tsk_table_collection_check_integrity``:``TSK_CHECK_MUTATION_TIME``. Table sorting orders mutations by non-increasing time per-site, which is also a requirement for a valid tree sequence. (:user:`benjeffery`, :pr:`672`) - Add ``metadata`` and ``metadata_schema`` fields to table collection, with accessors on tree sequence. These store arbitrary bytes and are optional in the file format. (:user:`benjeffery`, :pr:`641`) - Add the ``TSK_SIMPLIFY_KEEP_UNARY`` option to simplify (:user:`gtsambos`). See :issue:`1` and :pr:`143`. - Add a ``set_root_threshold`` option to tsk_tree_t which allows us to set the number of samples a node must be an ancestor of to be considered a root (:pr:`462`). 
- Change the semantics of tsk_tree_t so that sample counts are always computed, and add a new ``TSK_NO_SAMPLE_COUNTS`` option to turn this off (:pr:`462`). - Tables with metadata now have an optional ``metadata_schema`` field that can contain arbitrary bytes. (:user:`benjeffery`, :pr:`493`) - Tables loaded from a file can now be edited in the same way as any other table collection (:user:`jeromekelleher`, :issue:`536`, :pr:`530`). - Support for reading/writing to arbitrary file streams with the loadf/dumpf variants for tree sequence and table collection load/dump (:user:`jeromekelleher`, :user:`grahamgower`, :issue:`565`, :pr:`599`). - Add low-level sorting API and ``TSK_NO_CHECK_INTEGRITY`` flag (:user:`jeromekelleher`, :pr:`627`, :issue:`626`). - Add extension of Kendall-Colijn tree distance metric for tree sequences computed by ``tsk_treeseq_kc_distance`` (:user:`daniel-goldstein`, :pr:`548`) **Deprecated** - The ``TSK_SAMPLE_COUNTS`` option is now ignored and will print out a warning if used (:pr:`462`). --------------------- [0.99.2] - 2019-03-27 --------------------- Bugfix release. Changes: - Fix incorrect errors on tbl_collection_dump (#132) - Catch table overflows (#157) --------------------- [0.99.1] - 2019-01-24 --------------------- Refinements to the C API as we move towards 1.0.0. Changes: - Change the ``_tbl_`` abbreviation to ``_table_`` to improve readability. Hence, we now have, e.g., ``tsk_node_table_t`` etc. - Change ``tsk_tbl_size_t`` to ``tsk_size_t``. - Standardise public API to use ``tsk_size_t`` and ``tsk_id_t`` as appropriate. - Add ``tsk_flags_t`` typedef and consistently use this as the type used to encode bitwise flags. To avoid confusion, functions now have an ``options`` parameter. - Rename ``tsk_table_collection_position_t`` to ``tsk_bookmark_t``. - Rename ``tsk_table_collection_reset_position`` to ``tsk_table_collection_truncate`` and ``tsk_table_collection_record_position`` to ``tsk_table_collection_record_num_rows``. 
- Generalise ``tsk_table_collection_sort`` to take a bookmark as start argument. - Relax restriction that nodes in the ``samples`` argument to simplify must currently be marked as samples. (https://github.com/tskit-dev/tskit/issues/72) - Allow ``tsk_table_collection_simplify`` to take a NULL samples argument to specify "all samples in the current tables". - Add support for building as a meson subproject. --------------------- [0.99.0] - 2019-01-14 --------------------- Initial alpha version of the tskit C API tagged. Version 0.99.x represents the series of releases leading to version 1.0.0 which will be the first stable release. After 1.0.0, semver rules regarding API/ABI breakage will apply; however, in the 0.99.x series arbitrary changes may happen. -------------------- [0.0.0] - 2019-01-10 -------------------- Initial extraction of tskit code from msprime. Relicense to MIT. Code copied at hash 29921408661d5fe0b1a82b1ca302a8b87510fd23 ================================================ FILE: c/VERSION.txt ================================================ 1.3.1 ================================================ FILE: c/examples/Makefile ================================================ # Simple Makefile for building examples. # This will build the examples in the current directory by compiling in the # full tskit source into each of the examples. This is *not* recommended for # real projects! # # To use, type "make" in this directory. If you have GSL installed you # should then get two example programs built. 
# # **Note**: This repo uses git submodules, and these must be checked out # correctly for this makefile to work, e.g.: # # $ git clone git@github.com:tskit-dev/tskit.git --recurse-submodules # # See the documentation (https://tskit.dev/tskit/docs/stable/c-api.html) # for more details on how to use the C API, and the tskit build examples # repo (https://github.com/tskit-dev/tskit-build-examples) for examples # of how to set up a production-ready build with tskit. # CFLAGS=-I../ -I../subprojects/kastore TSKIT_SOURCE=../tskit/*.c ../subprojects/kastore/kastore.c targets = api_structure error_handling \ haploid_wright_fisher streaming \ tree_iteration tree_traversal \ take_ownership \ json_struct_metadata all: $(targets) $(targets): %: %.c ${CC} ${CFLAGS} -o $@ $< ${TSKIT_SOURCE} -lm clean: rm -f $(targets) ================================================ FILE: c/examples/api_structure.c ================================================ #include #include #include #define check_tsk_error(val) \ if (val < 0) { \ fprintf(stderr, "line %d: %s", __LINE__, tsk_strerror(val)); \ exit(EXIT_FAILURE); \ } int main(int argc, char **argv) { int j, ret; tsk_edge_table_t edges; ret = tsk_edge_table_init(&edges, 0); check_tsk_error(ret); for (j = 0; j < 5; j++) { ret = tsk_edge_table_add_row(&edges, 0, 1, j + 1, j, NULL, 0); check_tsk_error(ret); } tsk_edge_table_print_state(&edges, stdout); tsk_edge_table_free(&edges); return EXIT_SUCCESS; } ================================================ FILE: c/examples/cpp_sorting_example.cpp ================================================ #include #include #include #include #include #include #include #include static void handle_tskit_return_code(int code) { if (code != 0) { std::ostringstream o; o << tsk_strerror(code); throw std::runtime_error(o.str()); } } struct edge_plus_time { double time; tsk_id_t parent, child; double left, right; }; int sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) { if (sorter->tables->edges.metadata_length 
!= 0) { throw std::invalid_argument( "the sorter does not currently handle edge metadata"); } if (start != 0) { throw std::invalid_argument("the sorter requires start==0"); } std::vector temp; temp.reserve(static_cast(sorter->tables->edges.num_rows)); auto edges = &sorter->tables->edges; auto nodes = &sorter->tables->nodes; for (tsk_size_t i = 0; i < sorter->tables->edges.num_rows; ++i) { temp.push_back(edge_plus_time{ nodes->time[edges->parent[i]], edges->parent[i], edges->child[i], edges->left[i], edges->right[i] }); } std::sort(begin(temp), end(temp), [](const edge_plus_time &lhs, const edge_plus_time &rhs) { if (lhs.time == rhs.time) { if (lhs.parent == rhs.parent) { if (lhs.child == rhs.child) { return lhs.left < rhs.left; } return lhs.child < rhs.child; } return lhs.parent < rhs.parent; } return lhs.time < rhs.time; }); for (std::size_t i = 0; i < temp.size(); ++i) { edges->left[i] = temp[i].left; edges->right[i] = temp[i].right; edges->parent[i] = temp[i].parent; edges->child[i] = temp[i].child; } return 0; } int main(int argc, char **argv) { if (argc != 3) { std::cerr << "Usage: " << argv[0] << " input.trees output.trees\n"; std::exit(0); } const char *infile = argv[1]; const char *outfile = argv[2]; tsk_table_collection_t tables; auto ret = tsk_table_collection_load(&tables, infile, 0); handle_tskit_return_code(ret); tsk_table_sorter_t sorter; ret = tsk_table_sorter_init(&sorter, &tables, 0); handle_tskit_return_code(ret); sorter.sort_edges = sort_edges; try { ret = tsk_table_sorter_run(&sorter, NULL); } catch (std::exception &e) { std::cerr << e.what() << '\n'; std::exit(1); } handle_tskit_return_code(ret); ret = tsk_table_collection_dump(&tables, outfile, 0); handle_tskit_return_code(ret); ret = tsk_table_collection_free(&tables); handle_tskit_return_code(ret); } ================================================ FILE: c/examples/error_handling.c ================================================ #include #include #include int main(int argc, char **argv) { 
int ret; tsk_treeseq_t ts; if (argc != 2) { fprintf(stderr, "usage: "); exit(EXIT_FAILURE); } ret = tsk_treeseq_load(&ts, argv[1], 0); if (ret < 0) { /* Error condition. Free and exit */ tsk_treeseq_free(&ts); fprintf(stderr, "%s", tsk_strerror(ret)); exit(EXIT_FAILURE); } printf("Loaded tree sequence with %lld nodes and %lld edges from %s\n", (long long) tsk_treeseq_get_num_nodes(&ts), (long long) tsk_treeseq_get_num_edges(&ts), argv[1]); tsk_treeseq_free(&ts); return EXIT_SUCCESS; } ================================================ FILE: c/examples/haploid_wright_fisher.c ================================================ #include #include #include #include #include #define check_tsk_error(val) \ if (val < 0) { \ errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ } void simulate(tsk_table_collection_t *tables, int N, int T, int simplify_interval) { tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent; double breakpoint; int ret, j, t, b; assert(simplify_interval != 0); // leads to division by zero buffer = malloc(2 * N * sizeof(tsk_id_t)); if (buffer == NULL) { errx(EXIT_FAILURE, "Out of memory"); } tables->sequence_length = 1.0; parents = buffer; for (j = 0; j < N; j++) { parents[j] = tsk_node_table_add_row(&tables->nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0); check_tsk_error(parents[j]); } b = 0; for (t = T - 1; t >= 0; t--) { /* Alternate between using the first and last N values in the buffer */ parents = buffer + (b * N); b = (b + 1) % 2; children = buffer + (b * N); for (j = 0; j < N; j++) { child = tsk_node_table_add_row( &tables->nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0); check_tsk_error(child); /* NOTE: the use of rand() is discouraged for * research code and proper random number generator * libraries should be preferred. */ left_parent = parents[(size_t) ((rand() / (1. + RAND_MAX)) * N)]; right_parent = parents[(size_t) ((rand() / (1. + RAND_MAX)) * N)]; do { breakpoint = rand() / (1. 
+ RAND_MAX); } while (breakpoint == 0); /* tiny proba of breakpoint being 0 */ ret = tsk_edge_table_add_row( &tables->edges, 0, breakpoint, left_parent, child, NULL, 0); check_tsk_error(ret); ret = tsk_edge_table_add_row( &tables->edges, breakpoint, 1, right_parent, child, NULL, 0); check_tsk_error(ret); children[j] = child; } if (t % simplify_interval == 0) { printf("Simplify at generation %lld: (%lld nodes %lld edges)", (long long) t, (long long) tables->nodes.num_rows, (long long) tables->edges.num_rows); /* Note: Edges must be sorted for simplify to work, and we use a brute force * approach of sorting each time here for simplicity. This is inefficient. */ ret = tsk_table_collection_sort(tables, NULL, 0); check_tsk_error(ret); ret = tsk_table_collection_simplify(tables, children, N, 0, NULL); check_tsk_error(ret); printf(" -> (%lld nodes %lld edges)\n", (long long) tables->nodes.num_rows, (long long) tables->edges.num_rows); for (j = 0; j < N; j++) { children[j] = j; } } } free(buffer); } int main(int argc, char **argv) { int ret; tsk_table_collection_t tables; if (argc != 6) { errx(EXIT_FAILURE, "usage: N T simplify-interval output-file seed"); } ret = tsk_table_collection_init(&tables, 0); check_tsk_error(ret); srand((unsigned) atoi(argv[5])); simulate(&tables, atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); /* Sort and index so that the result can be opened as a tree sequence */ ret = tsk_table_collection_sort(&tables, NULL, 0); check_tsk_error(ret); ret = tsk_table_collection_build_index(&tables, 0); check_tsk_error(ret); ret = tsk_table_collection_dump(&tables, argv[4], 0); check_tsk_error(ret); tsk_table_collection_free(&tables); return 0; } ================================================ FILE: c/examples/json_struct_metadata.c ================================================ #include #include #include #include #include // these are properties of the ``json+struct`` codec, documented in tskit #define JSON_STRUCT_HEADER_SIZE 21 const uint8_t 
// Assemble a uint64_t from the eight bytes at `p`, treating p[0] as the
// least significant byte. Reading byte by byte keeps the result independent
// of host endianness and places no alignment requirement on `p`.
static uint64_t
load_u64_le(const uint8_t *p)
{
    uint64_t value = 0;
    int byte;

    // start at the most significant byte and shift earlier ones upwards
    for (byte = 7; byte >= 0; byte--) {
        value = (value << 8) | (uint64_t) p[byte];
    }
    return value;
}

// Store `value` into the eight bytes at `dest`, least significant byte
// first; the inverse of load_u64_le.
static void
set_u64_le(uint8_t *dest, uint64_t value)
{
    int byte;

    for (byte = 0; byte < 8; byte++) {
        dest[byte] = (uint8_t) (value & 0xFF);
        value >>= 8;
    }
}
// Split a `json+struct` codec buffer into its JSON and binary payloads.
//
// Layout as read here: 4 magic bytes, 1 version byte, then two little-endian
// uint64 lengths (JSON length at offset 5, binary length at offset 13). The
// JSON payload follows the header; zero bytes then pad the offset up to a
// multiple of 8, and the binary payload comes last. The sizes in the header
// must account for exactly `metadata_length` bytes.
//
// Any structural problem terminates the program via errx(). On success the
// `json` and `binary` outputs point into `metadata` itself; nothing is
// copied or allocated.
void
json_struct_codec_get_components(uint8_t *metadata, tsk_size_t metadata_length,
    uint8_t **json, tsk_size_t *json_length, uint8_t **binary,
    tsk_size_t *binary_length)
{
    const uint64_t header_size = (uint64_t) JSON_STRUCT_HEADER_SIZE;
    uint64_t json_size, binary_size, expected, pad, k;
    uint8_t *pad_bytes;

    // argument and header sanity checks
    if (metadata == NULL || json == NULL || json_length == NULL || binary == NULL
        || binary_length == NULL) {
        errx(EXIT_FAILURE, "bad parameter value.");
    }
    if (metadata_length < JSON_STRUCT_HEADER_SIZE) {
        errx(EXIT_FAILURE, "metadata truncated.");
    }
    if (memcmp(metadata, json_struct_codec_magic, sizeof(json_struct_codec_magic))
        != 0) {
        errx(EXIT_FAILURE, "bad magic bytes.");
    }
    if (metadata[4] != json_struct_codec_version) {
        errx(EXIT_FAILURE, "bad version number.");
    }

    json_size = load_u64_le(metadata + 5);
    binary_size = load_u64_le(metadata + 13);

    // build up the expected total size, refusing any step that would wrap
    if (json_size > UINT64_MAX - header_size) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    expected = header_size + json_size;
    pad = (8 - (expected % 8)) % 8;
    if (pad > UINT64_MAX - expected) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    expected += pad;
    if (binary_size > UINT64_MAX - expected) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    expected += binary_size;
    if ((uint64_t) metadata_length != expected) {
        errx(EXIT_FAILURE, "unexpected size.");
    }

    // the padding between the two payloads must be all zero
    pad_bytes = metadata + header_size + json_size;
    for (k = 0; k < pad; k++) {
        if (pad_bytes[k] != 0) {
            errx(EXIT_FAILURE, "padding bytes are nonzero.");
        }
    }

    // everything checks out; hand back views into the caller's buffer
    *json = metadata + header_size;
    *json_length = (tsk_size_t) json_size;
    *binary = pad_bytes + pad;
    *binary_length = (tsk_size_t) binary_size;
}
that contains the given components void json_struct_codec_create_buffer(const uint8_t *json, tsk_size_t json_length, const uint8_t *binary, tsk_size_t binary_length, uint8_t **buffer, tsk_size_t *buffer_length) { // figure out the total length of the codec's data and allocate the buffer for it tsk_size_t header_length = JSON_STRUCT_HEADER_SIZE; tsk_size_t padding_length = (8 - ((header_length + json_length) & 0x07)) % 8; tsk_size_t total_length = header_length + json_length + padding_length + binary_length; uint8_t *bytes = malloc(total_length); if (!bytes) errx(EXIT_FAILURE, "memory for buffer could not be allocated."); // then set up the bytes for the codec header memcpy(bytes, json_struct_codec_magic, 4); bytes[4] = json_struct_codec_version; set_u64_le(bytes + 5, (uint64_t) json_length); set_u64_le(bytes + 13, (uint64_t) binary_length); // copy in the JSON and binary data, separated by the padding bytes; the goal of the // padding bytes is to ensure that the binary data is 8-byte-aligned relative to the // start of the buffer memcpy(bytes + header_length, json, json_length); memset(bytes + header_length + json_length, 0, padding_length); memcpy(bytes + header_length + json_length + padding_length, binary, binary_length); // return the buffer and its length; the caller takes ownership of the buffer *buffer = bytes; *buffer_length = total_length; } int main(int argc, char **argv) { // we start with JSON and binary payloads that we encode into a new buffer // note that the JSON payload does not have to end with a trailing NULL const char json_payload[] = { '{', '"', 'a', '"', ':', '1', '}' }; const uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 }; uint8_t *metadata; tsk_size_t metadata_length; json_struct_codec_create_buffer((const uint8_t *) json_payload, sizeof(json_payload), binary_payload, sizeof(binary_payload), &metadata, &metadata_length); // then we decode that buffer to recover the json and binary data uint8_t *decoded_json, *decoded_binary; 
tsk_size_t decoded_json_length, decoded_binary_length; json_struct_codec_get_components(metadata, metadata_length, &decoded_json, &decoded_json_length, &decoded_binary, &decoded_binary_length); // print the recovered data to demonstrate that the round-trip worked // note that the JSON data is not NULL-terminated unless you put a NULL there! printf("JSON: %.*s\n", (int) decoded_json_length, decoded_json); printf("Binary data:"); for (tsk_size_t j = 0; j < decoded_binary_length; j++) printf(" %#04x", decoded_binary[j]); printf("\n"); free(metadata); return EXIT_SUCCESS; } ================================================ FILE: c/examples/multichrom_wright_fisher.c ================================================ #include #include #include #include #include #include #include #define check_tsk_error(val) \ if (val < 0) { \ errx(EXIT_FAILURE, "line %d: %s\n", __LINE__, tsk_strerror(val)); \ } static void init_tables(tsk_table_collection_t *tcs, int num_chroms) { int j, ret; for (j = 0; j < num_chroms; j++) { ret = tsk_table_collection_init(&tcs[j], 0); check_tsk_error(ret); if (j > 0) { tsk_node_table_free(&tcs[j].nodes); } } } static void free_tables(tsk_table_collection_t *tcs, int num_chroms) { int j; for (j = 0; j < num_chroms; j++) { if (j > 0) { /* Must not double free node table columns. 
*/ memset(&tcs[j].nodes, 0, sizeof(tcs[j].nodes)); } tsk_table_collection_free(&tcs[j]); } } static void join_tables(tsk_table_collection_t *tcs, int num_chroms) { int j, ret; for (j = 1; j < num_chroms; j++) { ret = tsk_edge_table_extend( &tcs[0].edges, &tcs[j].edges, tcs[j].edges.num_rows, NULL, 0); check_tsk_error(ret); } /* Get all the squashable edges next to each other */ ret = tsk_table_collection_sort(&tcs[0], NULL, 0); check_tsk_error(ret); ret = tsk_edge_table_squash(&tcs[0].edges); check_tsk_error(ret); /* We need to sort again after squash */ ret = tsk_table_collection_sort(&tcs[0], NULL, 0); check_tsk_error(ret); ret = tsk_table_collection_build_index(&tcs[0], 0); check_tsk_error(ret); } struct chunk_work { int chunk; tsk_table_collection_t *tc; int *samples; int N; }; void * simplify_chunk(void *arg) { int ret; struct chunk_work *work = (struct chunk_work *) arg; tsk_size_t edges_before = work->tc->edges.num_rows; ret = tsk_table_collection_sort(work->tc, NULL, 0); check_tsk_error(ret); ret = tsk_table_collection_simplify(work->tc, work->samples, work->N, TSK_SIMPLIFY_NO_FILTER_NODES | TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS, NULL); check_tsk_error(ret); /* NOTE: this printf makes helgrind complain */ printf("\tchunk %d: %lld -> %lld\n", work->chunk, (long long) edges_before, (long long) work->tc->edges.num_rows); return NULL; } void sort_and_simplify_all(tsk_table_collection_t *tcs, int num_chroms, int *samples, int N) { int j, ret; struct chunk_work work[num_chroms]; pthread_t threads[num_chroms]; for (j = 1; j < num_chroms; j++) { tcs[j].nodes = tcs[0].nodes; } for (j = 0; j < num_chroms; j++) { work[j].chunk = j; work[j].tc = &tcs[j]; work[j].samples = samples; work[j].N = N; ret = pthread_create(&threads[j], NULL, simplify_chunk, (void *) &work[j]); if (ret != 0) { errx(EXIT_FAILURE, "Pthread create failed"); } /* simplify_chunk((void *) &work[j]); */ } for (j = 0; j < num_chroms; j++) { ret = pthread_join(threads[j], NULL); if (ret != 0) { 
errx(EXIT_FAILURE, "Pthread join failed"); } } } void simplify_tables(tsk_table_collection_t *tcs, int num_chroms, int *samples, int N) { int j, k, num_edges, ret; const tsk_size_t num_nodes = tcs[0].nodes.num_rows; tsk_bool_t *keep_nodes = malloc(num_nodes * sizeof(*keep_nodes)); tsk_id_t *node_id_map = malloc(num_nodes * sizeof(*node_id_map)); tsk_id_t *edge_child, *edge_parent; if (keep_nodes == NULL || node_id_map == NULL) { errx(EXIT_FAILURE, "Out of memory"); } printf("Simplify %lld nodes\n", (long long) tcs[0].nodes.num_rows); sort_and_simplify_all(tcs, num_chroms, samples, N); for (j = 0; j < num_nodes; j++) { keep_nodes[j] = false; tcs[0].nodes.flags[j] &= (~TSK_NODE_IS_SAMPLE); } for (j = 0; j < N; j++) { keep_nodes[samples[j]] = true; tcs[0].nodes.flags[samples[j]] |= TSK_NODE_IS_SAMPLE; } for (j = 0; j < num_chroms; j++) { edge_child = tcs[j].edges.child; edge_parent = tcs[j].edges.parent; num_edges = tcs[j].edges.num_rows; for (k = 0; k < num_edges; k++) { keep_nodes[edge_child[k]] = true; keep_nodes[edge_parent[k]] = true; } } tsk_node_table_keep_rows(&tcs[0].nodes, keep_nodes, 0, node_id_map); printf("\tdone: %lld nodes\n", (long long) tcs[0].nodes.num_rows); /* Remap node references */ for (j = 0; j < num_chroms; j++) { edge_child = tcs[j].edges.child; edge_parent = tcs[j].edges.parent; num_edges = tcs[j].edges.num_rows; for (k = 0; k < num_edges; k++) { edge_child[k] = node_id_map[edge_child[k]]; edge_parent[k] = node_id_map[edge_parent[k]]; } ret = tsk_table_collection_check_integrity(&tcs[j], 0); check_tsk_error(ret); } for (j = 0; j < N; j++) { samples[j] = node_id_map[samples[j]]; } free(keep_nodes); free(node_id_map); } void simulate( tsk_table_collection_t *tcs, int num_chroms, int N, int T, int simplify_interval) { tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent; bool left_is_first; double chunk_left, chunk_right; int ret, j, t, b, k; assert(simplify_interval != 0); // leads to division by zero buffer = malloc(2 * N * 
sizeof(tsk_id_t)); if (buffer == NULL) { errx(EXIT_FAILURE, "Out of memory"); } for (k = 0; k < num_chroms; k++) { tcs[k].sequence_length = num_chroms; } parents = buffer; for (j = 0; j < N; j++) { parents[j] = tsk_node_table_add_row(&tcs[0].nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0); check_tsk_error(parents[j]); } b = 0; for (t = T - 1; t >= 0; t--) { /* Alternate between using the first and last N values in the buffer */ parents = buffer + (b * N); b = (b + 1) % 2; children = buffer + (b * N); for (j = 0; j < N; j++) { child = tsk_node_table_add_row( &tcs[0].nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0); check_tsk_error(child); /* NOTE: the use of rand() is discouraged for * research code and proper random number generator * libraries should be preferred. */ left_parent = parents[(size_t) ((rand() / (1. + RAND_MAX)) * N)]; right_parent = parents[(size_t) ((rand() / (1. + RAND_MAX)) * N)]; left_is_first = rand() < 0.5; chunk_left = 0.0; for (k = 0; k < num_chroms; k++) { chunk_right = chunk_left + rand() / (1. + RAND_MAX); /* a very tiny chance that right and left are equal */ if (chunk_right > chunk_left) { ret = tsk_edge_table_add_row(&tcs[k].edges, chunk_left, chunk_right, left_is_first ? left_parent : right_parent, child, NULL, 0); check_tsk_error(ret); } chunk_left += 1.0; if (chunk_right < chunk_left) { ret = tsk_edge_table_add_row(&tcs[k].edges, chunk_right, chunk_left, left_is_first ? 
right_parent : left_parent, child, NULL, 0); check_tsk_error(ret); } } children[j] = child; } if (t % simplify_interval == 0) { simplify_tables(tcs, num_chroms, children, N); } } /* Set the sample flags for final generation */ for (j = 0; j < N; j++) { tcs[0].nodes.flags[children[j]] = TSK_NODE_IS_SAMPLE; } free(buffer); } int main(int argc, char **argv) { int ret; int num_chroms; if (argc != 7) { errx(EXIT_FAILURE, "usage: N T simplify-interval output seed num-chroms"); } num_chroms = atoi(argv[6]); tsk_table_collection_t tcs[num_chroms]; srand((unsigned) atoi(argv[5])); init_tables(tcs, num_chroms); simulate(tcs, num_chroms, atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); join_tables(tcs, num_chroms); ret = tsk_table_collection_dump(&tcs[0], argv[4], 0); check_tsk_error(ret); free_tables(tcs, num_chroms); return 0; } ================================================ FILE: c/examples/multichrom_wright_fisher_singlethreaded.c ================================================ #include #include #include #include #include #include #define check_tsk_error(val) \ if (val < 0) { \ errx(EXIT_FAILURE, "line %d: %s\n", __LINE__, tsk_strerror(val)); \ } void simulate( tsk_table_collection_t *tables, int num_chroms, int N, int T, int simplify_interval) { tsk_id_t *buffer, *parents, *children, child, left_parent, right_parent; bool left_is_first; double chunk_left, chunk_right; int ret, j, t, b, k; assert(simplify_interval != 0); // leads to division by zero buffer = malloc(2 * N * sizeof(tsk_id_t)); if (buffer == NULL) { errx(EXIT_FAILURE, "Out of memory"); } tables->sequence_length = num_chroms; parents = buffer; for (j = 0; j < N; j++) { parents[j] = tsk_node_table_add_row(&tables->nodes, 0, T, TSK_NULL, TSK_NULL, NULL, 0); check_tsk_error(parents[j]); } b = 0; for (t = T - 1; t >= 0; t--) { /* Alternate between using the first and last N values in the buffer */ parents = buffer + (b * N); b = (b + 1) % 2; children = buffer + (b * N); for (j = 0; j < N; j++) { child = 
tsk_node_table_add_row( &tables->nodes, 0, t, TSK_NULL, TSK_NULL, NULL, 0); check_tsk_error(child); /* NOTE: the use of rand() is discouraged for * research code and proper random number generator * libraries should be preferred. */ left_parent = parents[(size_t) ((rand() / (1. + RAND_MAX)) * N)]; right_parent = parents[(size_t) ((rand() / (1. + RAND_MAX)) * N)]; left_is_first = rand() < 0.5; chunk_left = 0.0; for (k = 0; k < num_chroms; k++) { chunk_right = chunk_left + rand() / (1. + RAND_MAX); /* a very tiny chance that right and left are equal */ if (chunk_right > chunk_left) { ret = tsk_edge_table_add_row(&tables->edges, chunk_left, chunk_right, left_is_first ? left_parent : right_parent, child, NULL, 0); check_tsk_error(ret); } chunk_left += 1.0; if (chunk_right < chunk_left) { ret = tsk_edge_table_add_row(&tables->edges, chunk_right, chunk_left, left_is_first ? right_parent : left_parent, child, NULL, 0); check_tsk_error(ret); } } children[j] = child; } if (t % simplify_interval == 0) { printf("Simplify at generation %lld: (%lld nodes %lld edges)", (long long) t, (long long) tables->nodes.num_rows, (long long) tables->edges.num_rows); /* Note: Edges must be sorted for simplify to work, and we use a brute force * approach of sorting each time here for simplicity. This is inefficient. 
*/ ret = tsk_table_collection_sort(tables, NULL, 0); check_tsk_error(ret); ret = tsk_table_collection_simplify(tables, children, N, 0, NULL); check_tsk_error(ret); printf(" -> (%lld nodes %lld edges)\n", (long long) tables->nodes.num_rows, (long long) tables->edges.num_rows); for (j = 0; j < N; j++) { children[j] = j; } } } /* Set the sample flags for final generation */ for (j = 0; j < N; j++) { tables->nodes.flags[children[j]] = TSK_NODE_IS_SAMPLE; } free(buffer); } int main(int argc, char **argv) { int ret; tsk_table_collection_t tables; if (argc != 7) { errx(EXIT_FAILURE, "usage: N T simplify-interval output seed num-chroms"); } ret = tsk_table_collection_init(&tables, 0); check_tsk_error(ret); srand((unsigned) atoi(argv[5])); simulate(&tables, atoi(argv[6]), atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); /* Sort and index so that the result can be opened as a tree sequence */ ret = tsk_table_collection_sort(&tables, NULL, 0); check_tsk_error(ret); ret = tsk_table_collection_build_index(&tables, 0); check_tsk_error(ret); ret = tsk_table_collection_dump(&tables, argv[4], 0); check_tsk_error(ret); tsk_table_collection_free(&tables); return 0; } ================================================ FILE: c/examples/streaming.c ================================================ #include #include #include #define check_tsk_error(val) \ if (val < 0) { \ fprintf(stderr, "Error: line %d: %s\n", __LINE__, tsk_strerror(val)); \ exit(EXIT_FAILURE); \ } int main(int argc, char **argv) { int ret; int j = 0; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); check_tsk_error(ret); while (true) { ret = tsk_table_collection_loadf(&tables, stdin, TSK_NO_INIT); if (ret == TSK_ERR_EOF) { break; } check_tsk_error(ret); fprintf(stderr, "Tree sequence %d had %lld mutations\n", j, (long long) tables.mutations.num_rows); ret = tsk_mutation_table_truncate(&tables.mutations, 0); check_tsk_error(ret); ret = tsk_table_collection_dumpf(&tables, stdout, 0); 
check_tsk_error(ret); j++; } tsk_table_collection_free(&tables); return EXIT_SUCCESS; } ================================================ FILE: c/examples/take_ownership.c ================================================ #include #include #include #include #define check_tsk_error(val) \ if (val < 0) { \ errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ } int main(int argc, char **argv) { tsk_table_collection_t *tables; tsk_treeseq_t treeseq; int rv; tables = malloc(sizeof(*tables)); rv = tsk_table_collection_init(tables, 0); check_tsk_error(rv); /* NOTE: you must set sequence length AFTER initialization */ tables->sequence_length = 1.0; /* Do your regular table operations */ rv = tsk_node_table_add_row(&tables->nodes, 0, 0.0, -1, -1, NULL, 0); check_tsk_error(rv); /* Initalize the tree sequence, transferring all responsibility * for the table collection's memory managment */ rv = tsk_treeseq_init( &treeseq, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TAKE_OWNERSHIP); check_tsk_error(rv); /* WARNING: calling tsk_table_collection_free is now a memory error! 
*/ tsk_treeseq_free(&treeseq); } ================================================ FILE: c/examples/tree_iteration.c ================================================ #include #include #include #include #define check_tsk_error(val) \ if (val < 0) { \ errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ } int main(int argc, char **argv) { int ret; tsk_treeseq_t ts; tsk_tree_t tree; if (argc != 2) { errx(EXIT_FAILURE, "usage: "); } ret = tsk_treeseq_load(&ts, argv[1], 0); check_tsk_error(ret); ret = tsk_tree_init(&tree, &ts, 0); check_tsk_error(ret); printf("Iterate forwards\n"); for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { printf("\ttree %lld has %lld roots\n", (long long) tree.index, (long long) tsk_tree_get_num_roots(&tree)); } check_tsk_error(ret); printf("Iterate backwards\n"); for (ret = tsk_tree_last(&tree); ret == TSK_TREE_OK; ret = tsk_tree_prev(&tree)) { printf("\ttree %lld has %lld roots\n", (long long) tree.index, (long long) tsk_tree_get_num_roots(&tree)); } check_tsk_error(ret); tsk_tree_free(&tree); tsk_treeseq_free(&ts); return 0; } ================================================ FILE: c/examples/tree_traversal.c ================================================ #include #include #include #include #define check_tsk_error(val) \ if (val < 0) { \ errx(EXIT_FAILURE, "line %d: %s", __LINE__, tsk_strerror(val)); \ } static void traverse_standard(const tsk_tree_t *tree) { int ret; tsk_size_t num_nodes, j; tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); if (nodes == NULL) { errx(EXIT_FAILURE, "Out of memory"); } ret = tsk_tree_preorder(tree, nodes, &num_nodes); check_tsk_error(ret); for (j = 0; j < num_nodes; j++) { printf("Visit preorder %lld\n", (long long) nodes[j]); } ret = tsk_tree_postorder(tree, nodes, &num_nodes); check_tsk_error(ret); for (j = 0; j < num_nodes; j++) { printf("Visit postorder %lld\n", (long long) nodes[j]); } free(nodes); } static void _traverse(const 
tsk_tree_t *tree, tsk_id_t u, int depth) { tsk_id_t v; int j; for (j = 0; j < depth; j++) { printf(" "); } printf("Visit recursive %lld\n", (long long) u); for (v = tree->left_child[u]; v != TSK_NULL; v = tree->right_sib[v]) { _traverse(tree, v, depth + 1); } } static void traverse_recursive(const tsk_tree_t *tree) { _traverse(tree, tree->virtual_root, -1); } static void traverse_stack(const tsk_tree_t *tree) { int stack_top; tsk_id_t u, v; tsk_id_t *stack = malloc(tsk_tree_get_size_bound(tree) * sizeof(*stack)); if (stack == NULL) { errx(EXIT_FAILURE, "Out of memory"); } stack_top = 0; stack[stack_top] = tree->virtual_root; while (stack_top >= 0) { u = stack[stack_top]; stack_top--; printf("Visit stack %lld\n", (long long) u); /* Put nodes on the stack right-to-left, so we visit in left-to-right */ for (v = tree->right_child[u]; v != TSK_NULL; v = tree->left_sib[v]) { stack_top++; stack[stack_top] = v; } } free(stack); } static void traverse_upwards(const tsk_tree_t *tree) { const tsk_id_t *samples = tsk_treeseq_get_samples(tree->tree_sequence); tsk_size_t num_samples = tsk_treeseq_get_num_samples(tree->tree_sequence); tsk_size_t j; tsk_id_t u; for (j = 0; j < num_samples; j++) { u = samples[j]; while (u != TSK_NULL) { printf("Visit upwards: %lld\n", (long long) u); u = tree->parent[u]; } } } int main(int argc, char **argv) { int ret; tsk_treeseq_t ts; tsk_tree_t tree; if (argc != 2) { errx(EXIT_FAILURE, "usage: "); } ret = tsk_treeseq_load(&ts, argv[1], 0); check_tsk_error(ret); ret = tsk_tree_init(&tree, &ts, 0); check_tsk_error(ret); ret = tsk_tree_first(&tree); check_tsk_error(ret); traverse_standard(&tree); traverse_recursive(&tree); traverse_stack(&tree); traverse_upwards(&tree); tsk_tree_free(&tree); tsk_treeseq_free(&ts); return 0; } ================================================ FILE: c/meson.build ================================================ project('tskit', ['c', 'cpp'], version: files('VERSION.txt'), default_options: ['c_std=c99', 
'cpp_std=c++11']
)

# Debug build types additionally define TSK_TRACE_ERRORS.
debug_c_args = []
if get_option('buildtype').startswith('debug')
    debug_c_args = ['-DTSK_TRACE_ERRORS']
endif

# kastore is pulled in as a Meson subproject.
kastore_proj = subproject('kastore')
kastore_dep = kastore_proj.get_variable('kastore_dep')
kastore_inc = kastore_proj.get_variable('kastore_inc')

# libm is looked up but not required.
cc = meson.get_compiler('c')
m_dep = cc.find_library('m', required: false)
lib_deps = [m_dep, kastore_dep]

# Warning flags; note that -Werror turns every warning into a build failure.
extra_c_args = [
    '-Wall', '-Wextra', '-Werror', '-Wpedantic', '-W',
    '-Wmissing-prototypes', '-Wstrict-prototypes',
    '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align',
    '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs',
    '-fshort-enums', '-fno-common'] + debug_c_args

lib_sources = [
    'tskit/core.c', 'tskit/tables.c', 'tskit/trees.c',
    'tskit/genotypes.c', 'tskit/stats.c', 'tskit/convert.c', 'tskit/haplotype_matching.c']
lib_headers = [
    'tskit/core.h', 'tskit/tables.h', 'tskit/trees.h',
    'tskit/genotypes.h', 'tskit/stats.h', 'tskit/convert.h', 'tskit/haplotype_matching.h']

# Subprojects use the static library for simplicity.
tskit_inc = [kastore_inc, include_directories(['.'])]
tskit_lib = static_library('tskit',
    sources: lib_sources, dependencies: lib_deps)
tskit_dep = declare_dependency(include_directories:tskit_inc, link_with: tskit_lib)

# Everything below (install targets, tests, examples) is skipped when tskit
# is itself built as a subproject.
if not meson.is_subproject()

    # Shared library install target.
    shared_library('tskit',
        sources: lib_sources, dependencies: lib_deps, c_args: extra_c_args, install: true)
    install_headers('tskit.h')
    install_headers(lib_headers, subdir: 'tskit')

    cunit_dep = dependency('cunit')
    # We don't specify extra C args here as CUnit won't pass the checks.
    test_lib = static_library('testlib',
        sources: ['tests/testlib.c'], dependencies: [cunit_dep, kastore_dep, tskit_dep])

    # test_core additionally receives the project version as a preprocessor
    # string (MESON_PROJECT_VERSION).
    test_core = executable('test_core',
        sources: ['tests/test_core.c'],
        link_with: [tskit_lib, test_lib],
        c_args: extra_c_args+['-DMESON_PROJECT_VERSION="@0@"'.format(meson.project_version())],
        dependencies: kastore_dep,
        )
    test('core', test_core)

    test_tables = executable('test_tables',
        sources: ['tests/test_tables.c'],
        link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep)
    test('tables', test_tables)

    test_trees = executable('test_trees',
        sources: ['tests/test_trees.c'],
        link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep)
    test('trees', test_trees)

    test_genotypes = executable('test_genotypes',
        sources: ['tests/test_genotypes.c'],
        link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep)
    test('genotypes', test_genotypes)

    test_convert = executable('test_convert',
        sources: ['tests/test_convert.c'],
        link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep)
    test('convert', test_convert)

    test_stats = executable('test_stats',
        sources: ['tests/test_stats.c'],
        link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep)
    test('stats', test_stats)

    test_haplotype_matching = executable('test_haplotype_matching',
        sources: ['tests/test_haplotype_matching.c'],
        link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep)
    test('haplotype_matching', test_haplotype_matching)

    test_file_format = executable('test_file_format',
        sources: ['tests/test_file_format.c'],
        link_with: [tskit_lib, test_lib], c_args: extra_c_args, dependencies: kastore_dep)
    test('file_format', test_file_format)

    # The C++ smoke test links tskit only; it uses neither testlib nor
    # extra_c_args.
    test_minimal_cpp = executable('test_minimal_cpp',
        sources: ['tests/test_minimal_cpp.cpp'], link_with: [tskit_lib],
        dependencies: kastore_dep)
    test('minimal_cpp', test_minimal_cpp)

    if get_option('build_examples')
        # These example programs use less portable features,
        # and we don't want to always compile them. Use, e.g.,
        # meson build -Dbuild_examples=false
        executable('api_structure',
            sources: ['examples/api_structure.c'],
            link_with: [tskit_lib], dependencies: lib_deps)
        executable('error_handling',
            sources: ['examples/error_handling.c'],
            link_with: [tskit_lib], dependencies: lib_deps)
        executable('tree_iteration',
            sources: ['examples/tree_iteration.c'],
            link_with: [tskit_lib], dependencies: lib_deps)
        executable('tree_traversal',
            sources: ['examples/tree_traversal.c'],
            link_with: [tskit_lib], dependencies: lib_deps)
        executable('streaming',
            sources: ['examples/streaming.c'],
            link_with: [tskit_lib], dependencies: lib_deps)
        executable('cpp_sorting_example',
            sources: ['examples/cpp_sorting_example.cpp'],
            link_with: [tskit_lib], dependencies: lib_deps)
        executable('haploid_wright_fisher',
            sources: ['examples/haploid_wright_fisher.c'],
            link_with: [tskit_lib], dependencies: lib_deps)
        executable('multichrom_wright_fisher_singlethreaded',
            sources: ['examples/multichrom_wright_fisher_singlethreaded.c'],
            link_with: [tskit_lib], dependencies: lib_deps)
        executable('json_struct_metadata',
            sources: ['examples/json_struct_metadata.c'],
            link_with: [tskit_lib], dependencies: lib_deps)

        # The multi-threaded example is the only target that needs the
        # 'threads' dependency.
        thread_dep = dependency('threads')
        executable('multichrom_wright_fisher',
            sources: ['examples/multichrom_wright_fisher.c'],
            link_with: [tskit_lib], dependencies: [m_dep, kastore_dep, thread_dep])
    endif
endif

================================================
FILE: c/meson_options.txt
================================================
option('build_examples', type : 'boolean', value : true)

================================================
FILE: c/subprojects/kastore/README.md
================================================
This directory is an abbreviated version of the kastore distribution source.

All files should be updated when we are updating to a new kastore version.
================================================ FILE: c/subprojects/kastore/VERSION.txt ================================================ 2.1.2 ================================================ FILE: c/subprojects/kastore/kastore.c ================================================ #include #include #include #include #include #include #include "kastore.h" /* Private flag used to indicate when we have opened the file ourselves * and need to free it. */ /* Note: we use 1<<14 to keep this flag at the end of the flag space, * and this is the highest bit that can be guaranteed to fit into * an int. */ #define OWN_FILE (1 << 14) const char * kas_strerror(int err) { const char *ret = "Unknown error"; switch (err) { case KAS_ERR_GENERIC: ret = "Generic error; please file a bug report"; break; case KAS_ERR_IO: if (errno != 0) { ret = strerror(errno); } else { ret = "I/O error with errno unset. Please file a bug report"; } break; case KAS_ERR_BAD_MODE: ret = "Bad open mode; must be \"r\", \"w\", or \"a\""; break; case KAS_ERR_BAD_FLAGS: ret = "Unknown flags specified. Only (KAS_GET_TAKES_OWNERSHIP and/or" "KAS_READ_ALL) or 0 can be specified " "for open, and KAS_BORROWS_ARRAY or 0 for put"; break; case KAS_ERR_NO_MEMORY: ret = "Out of memory"; break; case KAS_ERR_BAD_FILE_FORMAT: ret = "File not in KAS format"; break; case KAS_ERR_VERSION_TOO_OLD: ret = "File format version is too old. Please upgrade using " "'kas upgrade '"; break; case KAS_ERR_VERSION_TOO_NEW: ret = "File format version is too new. 
Please upgrade your " "kastore library version"; break; case KAS_ERR_BAD_TYPE: ret = "Unknown data type"; break; case KAS_ERR_DUPLICATE_KEY: ret = "Duplicate key provided"; break; case KAS_ERR_KEY_NOT_FOUND: ret = "Key not found"; break; case KAS_ERR_EMPTY_KEY: ret = "Keys cannot be empty"; break; case KAS_ERR_ILLEGAL_OPERATION: ret = "Cannot perform the requested operation in the current mode"; break; case KAS_ERR_TYPE_MISMATCH: ret = "Mismatch between requested and stored types for array"; break; case KAS_ERR_EOF: ret = "End of file"; break; } return ret; } kas_version_t kas_version(void) { kas_version_t version; version.major = KAS_VERSION_MAJOR; version.minor = KAS_VERSION_MINOR; version.patch = KAS_VERSION_PATCH; return version; } static size_t type_size(int type) { const size_t type_size_map[] = { 1, 1, 2, 2, 4, 4, 8, 8, 4, 8 }; assert(type < KAS_NUM_TYPES); return type_size_map[type]; } /* Compare item keys lexicographically. */ static int compare_items(const void *a, const void *b) { const kaitem_t *ia = (const kaitem_t *) a; const kaitem_t *ib = (const kaitem_t *) b; size_t len = ia->key_len < ib->key_len ? ia->key_len : ib->key_len; int ret = memcmp(ia->key, ib->key, len); if (ret == 0) { ret = (ia->key_len > ib->key_len) - (ia->key_len < ib->key_len); } return ret; } /* When a read error occurs we don't know whether this is because the file * ended unexpectedly or an IO error occured. If the file ends unexpectedly * this is a file format error. 
*/
static int KAS_WARN_UNUSED
kastore_get_read_io_error(kastore_t *self)
{
    /* End-of-file, or a failure that left errno unset, is reported as a
     * format problem; anything else is an I/O error. */
    if (feof(self->file) || errno == 0) {
        return KAS_ERR_BAD_FILE_FORMAT;
    }
    return KAS_ERR_IO;
}

/*
 * Write the fixed-size file header: magic, file format version, item count
 * and total file size. Returns 0 on success or KAS_ERR_IO.
 */
static int KAS_WARN_UNUSED
kastore_write_header(kastore_t *self)
{
    const uint16_t major = KAS_FILE_VERSION_MAJOR;
    const uint16_t minor = KAS_FILE_VERSION_MINOR;
    const uint32_t item_count = (uint32_t) self->num_items;
    const uint64_t total_size = (uint64_t) self->file_size;
    char buf[KAS_HEADER_SIZE];

    /* Zero first so that every byte not set below stays 0. */
    memset(buf, 0, sizeof(buf));
    memcpy(buf, KAS_MAGIC, 8);
    memcpy(buf + 8, &major, 2);
    memcpy(buf + 10, &minor, 2);
    memcpy(buf + 12, &item_count, 4);
    memcpy(buf + 16, &total_size, 8);
    /* Rest of header is reserved */

    if (fwrite(buf, KAS_HEADER_SIZE, 1, self->file) != 1) {
        return KAS_ERR_IO;
    }
    return 0;
}

/*
 * Read and validate the file header, filling in self->file_version,
 * self->num_items and self->file_size.
 *
 * Returns 0 on success; KAS_ERR_EOF when nothing could be read and the
 * stream is at end-of-file; otherwise a format, version or I/O error.
 */
static int KAS_WARN_UNUSED
kastore_read_header(kastore_t *self)
{
    char buf[KAS_HEADER_SIZE];
    uint16_t major, minor;
    uint32_t item_count;
    uint64_t total_size;
    size_t nread;

    nread = fread(buf, 1, KAS_HEADER_SIZE, self->file);
    if (nread == 0 && feof(self->file)) {
        /* Nothing read at all and the stream is at EOF. */
        return KAS_ERR_EOF;
    }
    if (nread != KAS_HEADER_SIZE) {
        return kastore_get_read_io_error(self);
    }
    if (strncmp(buf, KAS_MAGIC, 8) != 0) {
        return KAS_ERR_BAD_FILE_FORMAT;
    }

    memcpy(&major, buf + 8, 2);
    memcpy(&minor, buf + 10, 2);
    memcpy(&item_count, buf + 12, 4);
    memcpy(&total_size, buf + 16, 8);

    /* The version is recorded before it is checked, so it is available to
     * the caller even when the file is rejected. */
    self->file_version[0] = (int) major;
    self->file_version[1] = (int) minor;
    if (self->file_version[0] < KAS_FILE_VERSION_MAJOR) {
        return KAS_ERR_VERSION_TOO_OLD;
    }
    if (self->file_version[0] > KAS_FILE_VERSION_MAJOR) {
        return KAS_ERR_VERSION_TOO_NEW;
    }

    self->num_items = item_count;
    self->file_size = (size_t) total_size;
    if (self->file_size < KAS_HEADER_SIZE) {
        return KAS_ERR_BAD_FILE_FORMAT;
    }
    return 0;
}

/* Compute the locations of the keys and
arrays in the file. */ static void kastore_pack_items(kastore_t *self) { size_t j, offset, remainder; /* Pack the keys */ offset = KAS_HEADER_SIZE + self->num_items * KAS_ITEM_DESCRIPTOR_SIZE; for (j = 0; j < self->num_items; j++) { self->items[j].key_start = offset; offset += self->items[j].key_len; } /* Pack the arrays */ for (j = 0; j < self->num_items; j++) { remainder = offset % KAS_ARRAY_ALIGN; if (remainder != 0) { offset += KAS_ARRAY_ALIGN - remainder; } self->items[j].array_start = offset; offset += self->items[j].array_len * type_size(self->items[j].type); } self->file_size = offset; } static int KAS_WARN_UNUSED kastore_write_descriptors(kastore_t *self) { int ret = 0; size_t j; uint8_t type; uint64_t key_start, key_len, array_start, array_len; char descriptor[KAS_ITEM_DESCRIPTOR_SIZE]; for (j = 0; j < self->num_items; j++) { memset(descriptor, 0, KAS_ITEM_DESCRIPTOR_SIZE); type = (uint8_t) self->items[j].type; key_start = (uint64_t) self->items[j].key_start; key_len = (uint64_t) self->items[j].key_len; array_start = (uint64_t) self->items[j].array_start; array_len = (uint64_t) self->items[j].array_len; memcpy(descriptor, &type, 1); /* Bytes 1-8 are reserved */ memcpy(descriptor + 8, &key_start, 8); memcpy(descriptor + 16, &key_len, 8); memcpy(descriptor + 24, &array_start, 8); memcpy(descriptor + 32, &array_len, 8); /* Rest of descriptor is reserved */ if (fwrite(descriptor, sizeof(descriptor), 1, self->file) != 1) { ret = KAS_ERR_IO; goto out; } } out: return ret; } static int KAS_WARN_UNUSED kastore_read_descriptors(kastore_t *self) { int ret = KAS_ERR_BAD_FILE_FORMAT; size_t j; uint8_t type; uint64_t key_start, key_len, array_start, array_len; char *descriptor; size_t descriptor_offset, offset, remainder, size, count; char *read_buffer = NULL; size = self->num_items * KAS_ITEM_DESCRIPTOR_SIZE; if (size + KAS_HEADER_SIZE > self->file_size) { goto out; } read_buffer = (char *) malloc(size); if (read_buffer == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; } 
count = fread(read_buffer, size, 1, self->file); if (count == 0) { ret = kastore_get_read_io_error(self); goto out; } descriptor_offset = 0; for (j = 0; j < self->num_items; j++) { descriptor = read_buffer + descriptor_offset; descriptor_offset += KAS_ITEM_DESCRIPTOR_SIZE; memcpy(&type, descriptor, 1); memcpy(&key_start, descriptor + 8, 8); memcpy(&key_len, descriptor + 16, 8); memcpy(&array_start, descriptor + 24, 8); memcpy(&array_len, descriptor + 32, 8); if (type >= KAS_NUM_TYPES) { ret = KAS_ERR_BAD_TYPE; goto out; } self->items[j].type = (int) type; if (key_start + key_len > self->file_size) { goto out; } self->items[j].key_start = (size_t) key_start; self->items[j].key_len = (size_t) key_len; if (array_start + array_len * type_size(type) > self->file_size) { goto out; } self->items[j].array_start = (size_t) array_start; self->items[j].array_len = (size_t) array_len; } /* Check the integrity of the key and array packing. Keys must * be packed sequentially starting immediately after the descriptors. 
*/ offset = KAS_HEADER_SIZE + self->num_items * KAS_ITEM_DESCRIPTOR_SIZE; for (j = 0; j < self->num_items; j++) { if (self->items[j].key_start != offset) { ret = KAS_ERR_BAD_FILE_FORMAT; goto out; } offset += self->items[j].key_len; } for (j = 0; j < self->num_items; j++) { /* Arrays are 8 byte aligned and adjacent */ remainder = offset % KAS_ARRAY_ALIGN; if (remainder != 0) { offset += KAS_ARRAY_ALIGN - remainder; } if (self->items[j].array_start != offset) { ret = KAS_ERR_BAD_FILE_FORMAT; goto out; } offset += self->items[j].array_len * type_size(self->items[j].type); } if (offset != self->file_size) { ret = KAS_ERR_BAD_FILE_FORMAT; goto out; } ret = 0; out: kas_safe_free(read_buffer); return ret; } static int KAS_WARN_UNUSED kastore_write_data(kastore_t *self) { int ret = 0; size_t j, size, offset, padding; char pad[KAS_ARRAY_ALIGN] = { 0, 0, 0, 0, 0, 0, 0 }; const void *write_array; offset = KAS_HEADER_SIZE + self->num_items * KAS_ITEM_DESCRIPTOR_SIZE; /* Write the keys. */ for (j = 0; j < self->num_items; j++) { assert(offset == self->items[j].key_start); if (fwrite(self->items[j].key, self->items[j].key_len, 1, self->file) != 1) { ret = KAS_ERR_IO; goto out; } offset += self->items[j].key_len; } /* Write the arrays. */ for (j = 0; j < self->num_items; j++) { padding = self->items[j].array_start - offset; assert(padding < KAS_ARRAY_ALIGN); if (padding > 0 && fwrite(pad, padding, 1, self->file) != 1) { ret = KAS_ERR_IO; goto out; } size = self->items[j].array_len * type_size(self->items[j].type); write_array = self->items[j].borrowed_array != NULL ? 
self->items[j].borrowed_array : self->items[j].array; assert(write_array != NULL); if (size > 0 && fwrite(write_array, size, 1, self->file) != 1) { ret = KAS_ERR_IO; goto out; } offset = self->items[j].array_start + size; } out: return ret; } static int KAS_WARN_UNUSED kastore_read_file(kastore_t *self) { int ret = 0; size_t count, size, offset, j; bool read_all = !!(self->flags & KAS_READ_ALL); offset = KAS_HEADER_SIZE + self->num_items * KAS_ITEM_DESCRIPTOR_SIZE; /* Read in up to the start of first array. This will contain all the keys. */ size = self->items[0].array_start; assert(size > offset); size -= offset; self->key_read_buffer = (char *) malloc(size); if (self->key_read_buffer == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; } count = fread(self->key_read_buffer, size, 1, self->file); if (count == 0) { ret = kastore_get_read_io_error(self); goto out; } /* Assign the pointers for the keys and arrays */ for (j = 0; j < self->num_items; j++) { /* keys are already loaded in the read buffer */ self->items[j].key = self->key_read_buffer + self->items[j].key_start - offset; if (read_all) { if (j == self->num_items - 1) { size = self->file_size - self->items[j].array_start; } else { size = self->items[j + 1].array_start - self->items[j].array_start; } self->items[j].array = (char *) malloc(size == 0 ? 1 : size); if (self->items[j].array == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; } if (size > 0) { count = fread(self->items[j].array, size, 1, self->file); if (count == 0) { ret = kastore_get_read_io_error(self); goto out; } } } } out: return ret; } static int KAS_WARN_UNUSED kastore_read_item(kastore_t *self, kaitem_t *item) { int ret = 0; int err; size_t size = item->array_len * type_size(item->type); size_t count; item->array = malloc(size == 0 ? 
1 : size); if (item->array == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; } if (size > 0) { err = fseek(self->file, self->file_offset + (long) item->array_start, SEEK_SET); if (err != 0) { ret = KAS_ERR_IO; goto out; } count = fread(item->array, size, 1, self->file); if (count == 0) { ret = kastore_get_read_io_error(self); goto out; } } out: return ret; } static int KAS_WARN_UNUSED kastore_write_file(kastore_t *self) { int ret = 0; qsort(self->items, self->num_items, sizeof(kaitem_t), compare_items); kastore_pack_items(self); ret = kastore_write_header(self); if (ret != 0) { goto out; } ret = kastore_write_descriptors(self); if (ret != 0) { goto out; } ret = kastore_write_data(self); if (ret != 0) { goto out; } out: return ret; } static int KAS_WARN_UNUSED kastore_read(kastore_t *self) { int ret = 0; if (!(self->flags & KAS_READ_ALL)) { /* Record the current file offset, in case this is a multi-store file, * so that we can seek to the correct location in kastore_read_item(). */ self->file_offset = ftell(self->file); if (self->file_offset == -1) { ret = KAS_ERR_IO; goto out; } } ret = kastore_read_header(self); if (ret != 0) { goto out; } if (self->num_items > 0) { self->items = (kaitem_t *) calloc(self->num_items, sizeof(*self->items)); if (self->items == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; } ret = kastore_read_descriptors(self); if (ret != 0) { goto out; } ret = kastore_read_file(self); if (ret != 0) { goto out; } } else if (self->file_size != KAS_HEADER_SIZE) { ret = KAS_ERR_BAD_FILE_FORMAT; goto out; } out: return ret; } static int KAS_WARN_UNUSED kastore_insert_all(kastore_t *self, kastore_t *other) { size_t j; int ret = 0; kaitem_t item; for (j = 0; j < other->num_items; j++) { item = other->items[j]; ret = kastore_put( self, item.key, item.key_len, item.array, item.array_len, item.type, 0); if (ret != 0) { goto out; } } out: return ret; } int KAS_WARN_UNUSED kastore_open(kastore_t *self, const char *filename, const char *mode, int flags) { int ret = 0; 
const char *file_mode; bool appending = false; kastore_t tmp; FILE *file; int err; memset(self, 0, sizeof(*self)); memset(&tmp, 0, sizeof(tmp)); if (strlen(mode) != 1) { ret = KAS_ERR_BAD_MODE; goto out; } if (strncmp(mode, "r", 1) == 0) { file_mode = "rb"; } else if (strncmp(mode, "w", 1) == 0) { file_mode = "wb"; } else if (strncmp(mode, "a", 1) == 0) { mode = "w"; file_mode = "wb"; appending = true; } else { ret = KAS_ERR_BAD_MODE; goto out; } if (appending) { ret = kastore_open(&tmp, filename, "r", KAS_READ_ALL); if (ret != 0) { goto out; } /* tmp will now have read all of the data into memory. We can now * close its file. We have to do this for Windows. */ err = fclose(tmp.file); tmp.file = NULL; if (err != 0) { ret = KAS_ERR_IO; goto out; } } file = fopen(filename, file_mode); if (file == NULL) { ret = KAS_ERR_IO; goto out; } ret = kastore_openf(self, file, mode, flags); if (ret != 0) { (void) fclose(file); } else { self->flags |= OWN_FILE; if (appending) { ret = kastore_insert_all(self, &tmp); } } out: if (appending) { kastore_close(&tmp); } return ret; } int KAS_WARN_UNUSED kastore_openf(kastore_t *self, FILE *file, const char *mode, int flags) { int ret = 0; memset(self, 0, sizeof(*self)); if (strlen(mode) != 1) { ret = KAS_ERR_BAD_MODE; goto out; } if (strncmp(mode, "r", 1) == 0) { self->mode = KAS_READ; } else if (strncmp(mode, "w", 1) == 0) { self->mode = KAS_WRITE; } else { ret = KAS_ERR_BAD_MODE; goto out; } if (flags > (KAS_READ_ALL | KAS_GET_TAKES_OWNERSHIP) || flags < 0) { ret = KAS_ERR_BAD_FLAGS; goto out; } self->flags = flags; self->file = file; if (self->mode == KAS_READ) { ret = kastore_read(self); } out: return ret; } int KAS_WARN_UNUSED kastore_close(kastore_t *self) { int ret = 0; int err; size_t j; if (self->mode == KAS_WRITE) { if (self->file != NULL) { ret = kastore_write_file(self); if (ret != 0) { /* Ignore errors on close now */ if (self->flags & OWN_FILE) { fclose(self->file); } self->file = NULL; } } if (self->items != NULL) { /* We 
only alloc memory for the keys and arrays in write mode */ for (j = 0; j < self->num_items; j++) { kas_safe_free(self->items[j].key); kas_safe_free(self->items[j].array); } } } else { kas_safe_free(self->key_read_buffer); if (self->items != NULL) { for (j = 0; j < self->num_items; j++) { kas_safe_free(self->items[j].array); } } } kas_safe_free(self->items); if (self->file != NULL && (self->flags & OWN_FILE)) { err = fclose(self->file); if (err != 0) { ret = KAS_ERR_IO; } } memset(self, 0, sizeof(*self)); return ret; } static int kastore_find_item(kastore_t *self, const char *key, size_t key_len, kaitem_t **item) { int ret = KAS_ERR_KEY_NOT_FOUND; kaitem_t search; search.key = (char *) malloc(key_len); search.key_len = key_len; if (self->mode != KAS_READ) { ret = KAS_ERR_ILLEGAL_OPERATION; goto out; } if (search.key == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; } memcpy(search.key, key, key_len); *item = bsearch( &search, self->items, self->num_items, sizeof(kaitem_t), compare_items); if (*item == NULL) { goto out; } ret = 0; out: kas_safe_free(search.key); return ret; } int KAS_WARN_UNUSED kastore_contains(kastore_t *self, const char *key, size_t key_len) { kaitem_t *item; int ret = kastore_find_item(self, key, key_len, &item); if (ret == 0) { ret = 1; } else if (ret == KAS_ERR_KEY_NOT_FOUND) { ret = 0; } return ret; } int KAS_WARN_UNUSED kastore_containss(kastore_t *self, const char *key) { return kastore_contains(self, key, strlen(key)); } int KAS_WARN_UNUSED kastore_get(kastore_t *self, const char *key, size_t key_len, void **array, size_t *array_len, int *type) { kaitem_t *item; int ret = kastore_find_item(self, key, key_len, &item); if (ret != 0) { goto out; } if (item->array == NULL) { ret = kastore_read_item(self, item); if (ret != 0) { goto out; } } *array = item->array; *array_len = item->array_len; *type = item->type; if (self->flags & KAS_GET_TAKES_OWNERSHIP) { item->array = NULL; } ret = 0; out: return ret; } int KAS_WARN_UNUSED kastore_gets( kastore_t 
*self, const char *key, void **array, size_t *array_len, int *type)
{
    /* The key is a NUL-terminated string, so its length comes from strlen */
    return kastore_get(self, key, strlen(key), array, array_len, type);
}

/* Fetch the array stored under the NUL-terminated `key` and check its type.
 *
 * Calls kastore_get and then compares the stored type code with `type`.
 * Returns 0 on success, the error from kastore_get, or
 * KAS_ERR_TYPE_MISMATCH when the stored type differs from the requested one.
 * Note that *array and *array_len have already been written by kastore_get
 * when a type mismatch is reported.
 */
static int KAS_WARN_UNUSED
kastore_gets_type(
    kastore_t *self, const char *key, void **array, size_t *array_len, int type)
{
    /* -1 is not a valid type code, so an unset value can never match */
    int loaded_type = -1;
    int ret;

    ret = kastore_get(self, key, strlen(key), array, array_len, &loaded_type);
    if (ret != 0) {
        goto out;
    }
    if (type != loaded_type) {
        ret = KAS_ERR_TYPE_MISMATCH;
        goto out;
    }
out:
    return ret;
}

/* Typed get wrappers: each one forwards to kastore_gets_type with the KAS_*
 * type code that corresponds to the element type of its `array` argument. */

int KAS_WARN_UNUSED
kastore_gets_int8(kastore_t *self, const char *key, int8_t **array, size_t *array_len)
{
    return kastore_gets_type(self, key, (void **) array, array_len, KAS_INT8);
}

int KAS_WARN_UNUSED
kastore_gets_uint8(kastore_t *self, const char *key, uint8_t **array, size_t *array_len)
{
    return kastore_gets_type(self, key, (void **) array, array_len, KAS_UINT8);
}

int KAS_WARN_UNUSED
kastore_gets_int16(kastore_t *self, const char *key, int16_t **array, size_t *array_len)
{
    return kastore_gets_type(self, key, (void **) array, array_len, KAS_INT16);
}

int KAS_WARN_UNUSED
kastore_gets_uint16(
    kastore_t *self, const char *key, uint16_t **array, size_t *array_len)
{
    return kastore_gets_type(self, key, (void **) array, array_len, KAS_UINT16);
}

int KAS_WARN_UNUSED
kastore_gets_int32(kastore_t *self, const char *key, int32_t **array, size_t *array_len)
{
    return kastore_gets_type(self, key, (void **) array, array_len, KAS_INT32);
}

int KAS_WARN_UNUSED
kastore_gets_uint32(
    kastore_t *self, const char *key, uint32_t **array, size_t *array_len)
{
    return kastore_gets_type(self, key, (void **) array, array_len, KAS_UINT32);
}

int KAS_WARN_UNUSED
kastore_gets_int64(kastore_t *self, const char *key, int64_t **array, size_t *array_len)
{
    return kastore_gets_type(self, key, (void **) array, array_len, KAS_INT64);
}

int KAS_WARN_UNUSED
kastore_gets_uint64(
    kastore_t *self, const char *key, uint64_t **array, size_t *array_len)
{
    return kastore_gets_type(self, key, (void **) array, array_len,
KAS_UINT64); } int KAS_WARN_UNUSED kastore_gets_float32(kastore_t *self, const char *key, float **array, size_t *array_len) { return kastore_gets_type(self, key, (void **) array, array_len, KAS_FLOAT32); } int KAS_WARN_UNUSED kastore_gets_float64(kastore_t *self, const char *key, double **array, size_t *array_len) { return kastore_gets_type(self, key, (void **) array, array_len, KAS_FLOAT64); } static int KAS_WARN_UNUSED kastore_put_item(kastore_t *self, kaitem_t **ret_item, const char *key, size_t key_len, int type, int KAS_UNUSED(flags)) { int ret = 0; kaitem_t *new_item; kaitem_t *p; size_t j; if (self->mode != KAS_WRITE) { ret = KAS_ERR_ILLEGAL_OPERATION; goto out; } if (type < 0 || type >= KAS_NUM_TYPES) { ret = KAS_ERR_BAD_TYPE; goto out; } if (key_len == 0) { ret = KAS_ERR_EMPTY_KEY; goto out; } /* This isn't terribly efficient, but we're not expecting large * numbers of items. */ p = (kaitem_t *) realloc(self->items, (self->num_items + 1) * sizeof(*self->items)); if (p == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; } self->items = p; new_item = self->items + self->num_items; memset(new_item, 0, sizeof(*new_item)); new_item->type = type; new_item->key_len = key_len; new_item->key = (char *) malloc(key_len); if (new_item->key == NULL) { kas_safe_free(new_item->key); ret = KAS_ERR_NO_MEMORY; goto out; } self->num_items++; memcpy(new_item->key, key, key_len); /* Check if this key is already in here. OK, this is a quadratic time * algorithm, but we're not expecting to have lots of items (< 100). In * this case, the simple algorithm is probably better. If/when we ever * deal with more items than this, then we will need a better algorithm. 
*/ for (j = 0; j < self->num_items - 1; j++) { if (compare_items(new_item, self->items + j) == 0) { /* Free the key memory and remove this item */ self->num_items--; kas_safe_free(new_item->key); ret = KAS_ERR_DUPLICATE_KEY; goto out; } } *ret_item = new_item; out: return ret; } static int KAS_WARN_UNUSED kastore_bput(kastore_t *self, const char *key, size_t key_len, const void *array, size_t array_len, int type, int flags) { int ret = 0; kaitem_t *item; ret = kastore_put_item(self, &item, key, key_len, type, flags); if (ret != 0) { goto out; } if (array == NULL) { /* Both can't be null, so assign a dummy array */ item->array = malloc(1); } else { item->borrowed_array = array; } item->borrowed_array = array; item->array_len = array_len; out: return ret; } int KAS_WARN_UNUSED kastore_put(kastore_t *self, const char *key, size_t key_len, const void *array, size_t array_len, int type, int flags) { int ret; size_t array_size; void *array_copy = NULL; if (flags != KAS_BORROWS_ARRAY && flags != 0) { ret = KAS_ERR_BAD_FLAGS; goto out; } if (type < 0 || type >= KAS_NUM_TYPES) { ret = KAS_ERR_BAD_TYPE; goto out; } if (flags & KAS_BORROWS_ARRAY) { ret = kastore_bput(self, key, key_len, array, array_len, type, flags); } else { array_size = type_size(type) * array_len; array_copy = malloc(array_size == 0 ? 
1 : array_size); if (array_copy == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; } memcpy(array_copy, array, array_size); ret = kastore_oput(self, key, key_len, array_copy, array_len, type, flags); if (ret == 0) { /* Kastore has taken ownership of the array, so we don't need to free it */ array_copy = NULL; } } out: kas_safe_free(array_copy); return ret; } int KAS_WARN_UNUSED kastore_oput(kastore_t *self, const char *key, size_t key_len, void *array, size_t array_len, int type, int flags) { int ret = 0; kaitem_t *item; if (flags != 0) { ret = KAS_ERR_BAD_FLAGS; goto out; } ret = kastore_put_item(self, &item, key, key_len, type, flags); if (ret != 0) { goto out; } item->array = array; item->array_len = array_len; out: return ret; } int KAS_WARN_UNUSED kastore_puts(kastore_t *self, const char *key, const void *array, size_t array_len, int type, int flags) { return kastore_put(self, key, strlen(key), array, array_len, type, flags); } int KAS_WARN_UNUSED kastore_puts_int8( kastore_t *self, const char *key, const int8_t *array, size_t array_len, int flags) { return kastore_puts(self, key, (const void *) array, array_len, KAS_INT8, flags); } int KAS_WARN_UNUSED kastore_puts_uint8( kastore_t *self, const char *key, const uint8_t *array, size_t array_len, int flags) { return kastore_puts(self, key, (const void *) array, array_len, KAS_UINT8, flags); } int KAS_WARN_UNUSED kastore_puts_int16( kastore_t *self, const char *key, const int16_t *array, size_t array_len, int flags) { return kastore_puts(self, key, (const void *) array, array_len, KAS_INT16, flags); } int KAS_WARN_UNUSED kastore_puts_uint16( kastore_t *self, const char *key, const uint16_t *array, size_t array_len, int flags) { return kastore_puts(self, key, (const void *) array, array_len, KAS_UINT16, flags); } int KAS_WARN_UNUSED kastore_puts_int32( kastore_t *self, const char *key, const int32_t *array, size_t array_len, int flags) { return kastore_puts(self, key, (const void *) array, array_len, KAS_INT32, flags); 
}

/* Remaining typed put wrappers: each one forwards to kastore_puts with the
 * KAS_* type code that corresponds to the element type of `array`. */

int KAS_WARN_UNUSED
kastore_puts_uint32(
    kastore_t *self, const char *key, const uint32_t *array, size_t array_len, int flags)
{
    return kastore_puts(self, key, (const void *) array, array_len, KAS_UINT32, flags);
}

int KAS_WARN_UNUSED
kastore_puts_int64(
    kastore_t *self, const char *key, const int64_t *array, size_t array_len, int flags)
{
    return kastore_puts(self, key, (const void *) array, array_len, KAS_INT64, flags);
}

int KAS_WARN_UNUSED
kastore_puts_uint64(
    kastore_t *self, const char *key, const uint64_t *array, size_t array_len, int flags)
{
    return kastore_puts(self, key, (const void *) array, array_len, KAS_UINT64, flags);
}

int KAS_WARN_UNUSED
kastore_puts_float32(
    kastore_t *self, const char *key, const float *array, size_t array_len, int flags)
{
    return kastore_puts(self, key, (const void *) array, array_len, KAS_FLOAT32, flags);
}

int KAS_WARN_UNUSED
kastore_puts_float64(
    kastore_t *self, const char *key, const double *array, size_t array_len, int flags)
{
    return kastore_puts(self, key, (const void *) array, array_len, KAS_FLOAT64, flags);
}

/* As kastore_oput (own-put), with the key length taken from strlen(key). */
int KAS_WARN_UNUSED
kastore_oputs(
    kastore_t *self, const char *key, void *array, size_t array_len, int type, int flags)
{
    return kastore_oput(self, key, strlen(key), array, array_len, type, flags);
}

/* Typed own-put wrappers: each one forwards to kastore_oputs with the KAS_*
 * type code that corresponds to the element type of `array`. */

int KAS_WARN_UNUSED
kastore_oputs_int8(
    kastore_t *self, const char *key, int8_t *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_INT8, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_uint8(
    kastore_t *self, const char *key, uint8_t *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_UINT8, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_int16(
    kastore_t *self, const char *key, int16_t *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_INT16, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_uint16(
    kastore_t *self, const char *key, uint16_t *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_UINT16, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_int32(
    kastore_t *self, const char *key, int32_t *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_INT32, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_uint32(
    kastore_t *self, const char *key, uint32_t *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_UINT32, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_int64(
    kastore_t *self, const char *key, int64_t *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_INT64, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_uint64(
    kastore_t *self, const char *key, uint64_t *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_UINT64, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_float32(
    kastore_t *self, const char *key, float *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_FLOAT32, flags);
}

int KAS_WARN_UNUSED
kastore_oputs_float64(
    kastore_t *self, const char *key, double *array, size_t array_len, int flags)
{
    return kastore_oputs(self, key, (void *) array, array_len, KAS_FLOAT64, flags);
}

/* Debugging aid: print the store's bookkeeping fields, followed by one line
 * per item, to `out`. */
void
kastore_print_state(kastore_t *self, FILE *out)
{
    kaitem_t *item;
    size_t j;

    fprintf(out, "============================\n");
    fprintf(out, "kastore state\n");
    fprintf(out, "file_version = %d.%d\n", self->file_version[0], self->file_version[1]);
    fprintf(out, "mode = %d\n", self->mode);
    fprintf(out, "flags = %d\n", self->flags);
    fprintf(out, "num_items = %zu\n", self->num_items);
    fprintf(out, "file_size = %zu\n", self->file_size);
    /* OWN_FILE is a bit in flags; !! normalises it to 0 or 1 for printing */
    fprintf(out, "own_file = %d\n", !!(self->flags & OWN_FILE));
    fprintf(out, "file = '%p'\n", (void *) self->file);
    fprintf(out, "============================\n");
    for (j = 0; j < self->num_items; j++) {
        item = self->items + j;
        fprintf(out, "%.*s: type=%d, 
key_start=%zu, key_len=%zu, key=%p, " "array_start=%zu, array_len=%zu, array=%p\n", (int) item->key_len, item->key, item->type, item->key_start, item->key_len, (void *) item->key, item->array_start, item->array_len, (void *) item->array); } fprintf(out, "============================\n"); } ================================================ FILE: c/subprojects/kastore/kastore.h ================================================ /** * @file kastore.h * @brief Public API for kastore. * * This is the API documentation for kastore. */ #ifndef KASTORE_H #define KASTORE_H #ifdef __cplusplus extern "C" { #endif #ifdef __GNUC__ #define KAS_WARN_UNUSED __attribute__((warn_unused_result)) #define KAS_UNUSED(x) KAS_UNUSED_##x __attribute__((__unused__)) #else #define KAS_WARN_UNUSED #define KAS_UNUSED(x) KAS_UNUSED_##x #endif #include <stdbool.h> #include <stdint.h> #include <stddef.h> #include <stdio.h> /** @defgroup ERROR_GROUP Error return values. @{ */ // clang-format off /** Generic error thrown when no other message can be generated. */ #define KAS_ERR_GENERIC -1 /** An error occurred during IO. */ #define KAS_ERR_IO -2 /** An unrecognised mode string was passed to open(). */ #define KAS_ERR_BAD_MODE -3 /** Out-of-memory condition. */ #define KAS_ERR_NO_MEMORY -4 /** Attempt to read an unknown file format. */ #define KAS_ERR_BAD_FILE_FORMAT -5 /** The file is in kastore format, but the version is too old for this version of the library to read. */ #define KAS_ERR_VERSION_TOO_OLD -6 /** The file is in kastore format, but the version is too new for this version of the library to read. */ #define KAS_ERR_VERSION_TOO_NEW -7 /** An unknown type key was specified. */ #define KAS_ERR_BAD_TYPE -8 /** A zero-length key was specified. */ #define KAS_ERR_EMPTY_KEY -9 /** A duplicate key was specified. */ #define KAS_ERR_DUPLICATE_KEY -10 /** The requested key does not exist in the store. */ #define KAS_ERR_KEY_NOT_FOUND -11 /** The requested function cannot be called in the current mode. 
*/ #define KAS_ERR_ILLEGAL_OPERATION -12 /** The requested type does not match the type of the stored values. */ #define KAS_ERR_TYPE_MISMATCH -13 /** End of file was reached while reading data. */ #define KAS_ERR_EOF -14 /** Unknown flags were provided to open. */ #define KAS_ERR_BAD_FLAGS -15 /** @} */ /* Flags for open */ #define KAS_READ_ALL (1 << 0) #define KAS_GET_TAKES_OWNERSHIP (1 << 1) /* Flags for put */ #define KAS_BORROWS_ARRAY (1 << 8) /** @defgroup TYPE_GROUP Data types. @{ */ #define KAS_INT8 0 #define KAS_UINT8 1 #define KAS_INT16 2 #define KAS_UINT16 3 #define KAS_INT32 4 #define KAS_UINT32 5 #define KAS_INT64 6 #define KAS_UINT64 7 #define KAS_FLOAT32 8 #define KAS_FLOAT64 9 /** @} */ #define KAS_NUM_TYPES 10 #define KAS_READ 1 #define KAS_WRITE 2 /** @defgroup FILE_VERSION_GROUP File version macros. @{ */ /** The file version major number. Incremented when any breaking changes are made to the file format. */ #define KAS_FILE_VERSION_MAJOR 1 /** The file version minor number. Incremented when non-breaking backward-compatible changes are made to the file format. */ #define KAS_FILE_VERSION_MINOR 0 /** @} */ /** @defgroup API_VERSION_GROUP API version macros. @{ */ /** The library major version. Incremented when breaking changes to the API or ABI are introduced. This includes any changes to the signatures of functions and the sizes and types of externally visible structs. */ #define KAS_VERSION_MAJOR 2 /** The library minor version. Incremented when non-breaking backward-compatible changes to the API or ABI are introduced, i.e., the addition of a new function. */ #define KAS_VERSION_MINOR 1 /** The library patch version. Incremented when any changes not relevant to the API or ABI are introduced, i.e., internal refactors or bugfixes. 
*/ #define KAS_VERSION_PATCH 2 /** @} */ #define KAS_HEADER_SIZE 64 #define KAS_ITEM_DESCRIPTOR_SIZE 64 #define KAS_MAGIC "\211KAS\r\n\032\n" #define KAS_ARRAY_ALIGN 8 // clang-format on #ifndef KAS_BUG_ASSERT_MESSAGE #define KAS_BUG_ASSERT_MESSAGE \ "If you are using kastore directly please open an issue on" \ " GitHub, ideally with a reproducible example." \ " (https://github.com/tskit-dev/kastore/issues) If you are" \ " using software that uses kastore, please report an issue" \ " to that software's issue tracker, at least initially." #endif /** We often wish to assert a condition that is unexpected, but using the normal `assert` means compiling without NDEBUG. This macro still asserts when NDEBUG is defined. */ #define kas_bug_assert(condition) \ do { \ if (!(condition)) { \ fprintf(stderr, "Bug detected in %s at line %d. %s\n", __FILE__, __LINE__, \ KAS_BUG_ASSERT_MESSAGE); \ abort(); \ } \ } while (0) typedef struct { int type; size_t key_len; size_t array_len; char *key; /* Used when KAS_BORROWS_ARRAY is set */ const void *borrowed_array; void *array; size_t key_start; size_t array_start; } kaitem_t; /** @brief A file-backed store of key-array values. */ typedef struct { int flags; int mode; int file_version[2]; size_t num_items; kaitem_t *items; FILE *file; size_t file_size; long file_offset; char *key_read_buffer; } kastore_t; /** @brief Library version information. */ typedef struct { /** @brief The major version number. */ int major; /** @brief The minor version number. */ int minor; /** @brief The patch version number. */ int patch; } kas_version_t; /** @brief Open a store from a given file in read ("r"), write ("w") or append ("a") mode. @rst In read mode, a store can be queried using the :ref:`get functions ` and any attempts to write to the store will return an error. In write and append mode, the store can be written to using the :ref:`put functions ` and any attempt to read will return an error. 
After :c:func:`kastore_open` has been called on a particular store, :c:func:`kastore_close` must be called to avoid leaking memory. This must also be done when :c:func:`kastore_open` returns an error. When opened in read-mode, the default is to read key/array values from file on demand. This is useful when a subset of the data is required and we don't wish to read the entire file. If the entire file is to be read, the ``KAS_READ_ALL`` flag may be specified to improve performance. **Flags** KAS_READ_ALL If this option is specified, read the entire file at open time. This will give slightly better performance as the file can be read sequentially in a single pass. KAS_GET_TAKES_OWNERSHIP If this option is specified, all ``get`` operations will transfer ownership of the array to the caller. ``kastore`` will not ``free`` the array memory and this is the responsibility of the caller. If ``get`` is called on the same key multiple times, a new buffer will be returned each time. Note that second and subsequent ``get`` calls on a given key will result in ``seek`` operations even when the KAS_READ_ALL flag is set, and will therefore fail on unseekable streams. @endrst @param self A pointer to a kastore object. @param filename The file path to open. @param mode The open mode: can be read ("r"), write ("w") or append ("a"). @param flags The open flags. @return Return 0 on success or a negative value on failure. */ int kastore_open(kastore_t *self, const char *filename, const char *mode, int flags); /** @brief Open a store from a given FILE pointer. @rst Behaviour, mode and flags follow that of :c:func:`kastore_open`, except append mode is not supported. The ``file`` argument must be opened in an appropriate mode (e.g. "r" for a kastore in "r" mode). Files open with other modes will result in KAS_ERR_IO being returned when read/write operations are attempted. The FILE will not be closed when :c:func:`kastore_close` is called. 
If the KAS_READ_ALL flag is supplied, no ``seek`` operations will be performed on the FILE and so streams such as stdin, FIFOs etc are supported. The FILE pointer will be positioned exactly at the end of the kastore encoded bytes once reading is completed, and reading multiple stores from the same FILE sequentially is fully supported. @endrst @param self A pointer to a kastore object. @param file The FILE* to read/write the store from/to. @param mode The open mode: can be read ("r") or write ("w"). @param flags The open flags. @return Return 0 on success or a negative value on failure. */ int kastore_openf(kastore_t *self, FILE *file, const char *mode, int flags); /** @brief Close an opened store, freeing all resources. Any store that has been opened must be closed to avoid memory leaks (including cases in which errors have occurred). It is not an error to call ``kastore_close`` multiple times on the same object, but ``kastore_open`` must be called before ``kastore_close``. @param self A pointer to a kastore object. @return Return 0 on success or a negative value on failure. */ int kastore_close(kastore_t *self); /** @brief Return 1 if the store contains the specified key and 0 if it does not. @rst Queries the store for the specified key and returns 1 if it exists. If the key does not exist, 0 is returned. If an error occurs (for example, if querying the store while it is in write-mode), a negative value is returned. For keys that are standard NULL terminated strings, the :c:func:`kastore_containss` function may be more convenient. @endrst @param self A pointer to a kastore object. @param key The key. @param key_len The length of the key. @return Return 1 if the key is present and 0 if it does not. If an error occurs, return a negative value. */ int kastore_contains(kastore_t *self, const char *key, size_t key_len); /** @brief Return 1 if the store contains the specified NULL terminated key and 0 if it does not. 
@rst Queries the store for the specified key, which must be a NULL terminated string, and returns 1 if it exists. If the key does not exist, 0 is returned. If an error occurs (for example, if querying the store while it is in write-mode), a negative value is returned. @endrst @param self A pointer to a kastore object. @param key The key. @return Return 1 if the key is present and 0 if it does not. If an error occurs, return a negative value. */ int kastore_containss(kastore_t *self, const char *key); /** @brief Get the array for the specified key. @rst Queries the store for the specified key and stores pointers to the memory for the corresponding array, the number of elements in this array and the type of the array in the specified destination pointers. This is the most general form of ``get`` query in kastore, as non NULL-terminated strings can be used as keys and the resulting array is returned in a generic pointer. When standard C strings are used as keys and the type of the array is known, it is more convenient to use the :ref:`typed variants ` of this function. The returned array points to memory that is internally managed by the store and must not be freed or modified. The pointer is guaranteed to be valid until :c:func:`kastore_close` is called. @endrst @param self A pointer to a kastore object. @param key The key. @param key_len The length of the key. @param array The destination pointer for the array. @param array_len The destination pointer for the number of elements in the array. @param type The destination pointer for the type code of the array. @return Return 0 on success or a negative value on failure. */ int kastore_get(kastore_t *self, const char *key, size_t key_len, void **array, size_t *array_len, int *type); /** @brief Get the array for the specified NULL-terminated key. @rst As for :c:func:`kastore_get()` except the key is a NULL-terminated string. 
@endrst @param self A pointer to a kastore object. @param key The key. @param array The destination pointer for the array. @param array_len The destination pointer for the number of elements in the array. @param type The destination pointer for the type code of the array. @return Return 0 on success or a negative value on failure. */ int kastore_gets( kastore_t *self, const char *key, void **array, size_t *array_len, int *type); /** @defgroup TYPED_GETS_GROUP Typed get functions. @{ */ int kastore_gets_int8( kastore_t *self, const char *key, int8_t **array, size_t *array_len); int kastore_gets_uint8( kastore_t *self, const char *key, uint8_t **array, size_t *array_len); int kastore_gets_int16( kastore_t *self, const char *key, int16_t **array, size_t *array_len); int kastore_gets_uint16( kastore_t *self, const char *key, uint16_t **array, size_t *array_len); int kastore_gets_int32( kastore_t *self, const char *key, int32_t **array, size_t *array_len); int kastore_gets_uint32( kastore_t *self, const char *key, uint32_t **array, size_t *array_len); int kastore_gets_int64( kastore_t *self, const char *key, int64_t **array, size_t *array_len); int kastore_gets_uint64( kastore_t *self, const char *key, uint64_t **array, size_t *array_len); int kastore_gets_float32( kastore_t *self, const char *key, float **array, size_t *array_len); int kastore_gets_float64( kastore_t *self, const char *key, double **array, size_t *array_len); /** @} */ /** @brief Insert the specified key-array pair into the store. @rst A key with the specified length is inserted into the store and associated with an array of the specified type and number of elements. The contents of the specified key and array are copied unless the KAS_BORROWS_ARRAY flag is specified. If KAS_BORROWS_ARRAY is specified the array buffer must persist until the kastore is closed. Keys can be any sequence of bytes but must be at least one byte long and be unique. There is no restriction on the contents of arrays. 
This is the most general form of ``put`` operation in kastore; when the type of the array is known and the keys are standard C strings, it is usually more convenient to use the :ref:`typed variants ` of this function. @endrst @param self A pointer to a kastore object. @param key The key. @param key_len The length of the key. @param array The array. @param array_len The number of elements in the array. @param type The type of the array. @param flags The insertion flags, only KAS_BORROWS_ARRAY or 0 is valid. @return Return 0 on success or a negative value on failure. */ int kastore_put(kastore_t *self, const char *key, size_t key_len, const void *array, size_t array_len, int type, int flags); /** @brief Insert the specified NULL terminated key and array pair into the store. @rst As for :c:func:`kastore_put` except the key must be NULL-terminated C string. @endrst @param self A pointer to a kastore object. @param key The key. @param array The array. @param array_len The number of elements in the array. @param type The type of the array. @param flags The insertion flags, only KAS_BORROWS_ARRAY or 0 is valid. @return Return 0 on success or a negative value on failure. */ int kastore_puts(kastore_t *self, const char *key, const void *array, size_t array_len, int type, int flags); /** @defgroup TYPED_PUTS_GROUP Typed put functions. 
@{ */ int kastore_puts_int8( kastore_t *self, const char *key, const int8_t *array, size_t array_len, int flags); int kastore_puts_uint8( kastore_t *self, const char *key, const uint8_t *array, size_t array_len, int flags); int kastore_puts_int16( kastore_t *self, const char *key, const int16_t *array, size_t array_len, int flags); int kastore_puts_uint16(kastore_t *self, const char *key, const uint16_t *array, size_t array_len, int flags); int kastore_puts_int32( kastore_t *self, const char *key, const int32_t *array, size_t array_len, int flags); int kastore_puts_uint32(kastore_t *self, const char *key, const uint32_t *array, size_t array_len, int flags); int kastore_puts_int64( kastore_t *self, const char *key, const int64_t *array, size_t array_len, int flags); int kastore_puts_uint64(kastore_t *self, const char *key, const uint64_t *array, size_t array_len, int flags); int kastore_puts_float32( kastore_t *self, const char *key, const float *array, size_t array_len, int flags); int kastore_puts_float64( kastore_t *self, const char *key, const double *array, size_t array_len, int flags); /** @} */ /** @brief Insert the specified key-array pair into the store, transferring ownership of the malloced array buffer to the store (own-put). @rst A key with the specified length is inserted into the store and associated with an array of the specified type and number of elements. The contents of the specified key is copied, but the array buffer is taken directly and freed when the store is closed. The array buffer must be a pointer returned by ``malloc`` or ``calloc``. Ownership of the buffer is not taken unless the function returns successfully. Apart from taking ownership of the array buffer, the semantics of this function are identical to :c:func:`kastore_put`. @endrst @param self A pointer to a kastore object. @param key The key. @param key_len The length of the key. @param array The array. Must be a pointer returned by malloc/calloc. 
@param array_len The number of elements in the array. @param type The type of the array. @param flags The insertion flags. Currently unused. @return Return 0 on success or a negative value on failure. */ int kastore_oput(kastore_t *self, const char *key, size_t key_len, void *array, size_t array_len, int type, int flags); /** @brief Insert the specified NULL terminated key and array pair into the store, transferring ownership of the malloced array buffer to the store (own-put). @rst As for :c:func:`kastore_oput` except the key must be NULL-terminated C string. @endrst @param self A pointer to a kastore object. @param key The key. @param array The array. Must be a pointer returned by malloc/calloc. @param array_len The number of elements in the array. @param type The type of the array. @param flags The insertion flags. Currently unused. @return Return 0 on success or a negative value on failure. */ int kastore_oputs(kastore_t *self, const char *key, void *array, size_t array_len, int type, int flags); /** @defgroup TYPED_OPUTS_GROUP Typed own-and-put functions. 
@{ */ int kastore_oputs_int8( kastore_t *self, const char *key, int8_t *array, size_t array_len, int flags); int kastore_oputs_uint8( kastore_t *self, const char *key, uint8_t *array, size_t array_len, int flags); int kastore_oputs_int16( kastore_t *self, const char *key, int16_t *array, size_t array_len, int flags); int kastore_oputs_uint16( kastore_t *self, const char *key, uint16_t *array, size_t array_len, int flags); int kastore_oputs_int32( kastore_t *self, const char *key, int32_t *array, size_t array_len, int flags); int kastore_oputs_uint32( kastore_t *self, const char *key, uint32_t *array, size_t array_len, int flags); int kastore_oputs_int64( kastore_t *self, const char *key, int64_t *array, size_t array_len, int flags); int kastore_oputs_uint64( kastore_t *self, const char *key, uint64_t *array, size_t array_len, int flags); int kastore_oputs_float32( kastore_t *self, const char *key, float *array, size_t array_len, int flags); int kastore_oputs_float64( kastore_t *self, const char *key, double *array, size_t array_len, int flags); /** @} */ void kastore_print_state(kastore_t *self, FILE *out); /** @brief Returns a description of the specified error code. @param err The error code. @return String describing the error code. */ const char *kas_strerror(int err); /** @brief Returns the API version. @rst The API follows the `semver convention <https://semver.org/>`_, where the major, minor and patch numbers have specific meanings. The versioning scheme here also takes into account ABI compatibility. 
@endrst */ kas_version_t kas_version(void); #define kas_safe_free(pointer) \ do { \ if (pointer != NULL) { \ free(pointer); \ pointer = NULL; \ } \ } while (0) #ifdef __cplusplus } #endif #endif ================================================ FILE: c/subprojects/kastore/meson.build ================================================ project('kastore', ['c', 'cpp'], version: files('VERSION.txt'), default_options: [ 'c_std=c99', 'cpp_std=c++11', 'warning_level=3', 'werror=true']) if not meson.is_subproject() add_global_arguments([ '-W', '-Wmissing-prototypes', '-Wstrict-prototypes', '-Wconversion', '-Wshadow', '-Wpointer-arith', '-Wcast-align', '-Wcast-qual', '-Wwrite-strings', '-Wnested-externs', '-fshort-enums', '-fno-common'], language : 'c') endif # Subprojects should compile in the static library for simplicity. kastore_inc = include_directories('.') kastore = static_library('kastore', 'kastore.c') kastore_dep = declare_dependency(link_with : kastore, include_directories: kastore_inc) if not meson.is_subproject() # The shared library can be installed into the system. 
install_headers('kastore.h') shared_library('kastore', 'kastore.c', install: true) executable('example', ['example.c'], link_with: kastore) cunit_dep = dependency('cunit') src_root = meson.project_source_root() tests_exe = executable('tests', ['tests.c', 'kastore.c'], dependencies: cunit_dep, c_args: ['-DMESON_VERSION="@0@"'.format(meson.project_version())]) test('tests', tests_exe, env: ['KAS_TEST_DATA_PREFIX=' + src_root + '/test-data/']) cpp_tests_exe = executable('cpp_tests', ['cpp_tests.cpp'], link_with: kastore) test('cpp_tests', cpp_tests_exe) malloc_tests_exe = executable('malloc_tests', ['malloc_tests.c', 'kastore.c'], dependencies: cunit_dep, link_args:['-Wl,--wrap=malloc', '-Wl,--wrap=realloc', '-Wl,--wrap=calloc']) test('malloc_tests', malloc_tests_exe, workdir: src_root) io_tests_exe = executable('io_tests', ['io_tests.c', 'kastore.c'], dependencies: cunit_dep, link_args:[ '-Wl,--wrap=fwrite', '-Wl,--wrap=fread', '-Wl,--wrap=fclose', '-Wl,--wrap=ftell', '-Wl,--wrap=fseek']) test('io_tests', io_tests_exe, workdir: src_root) endif ================================================ FILE: c/tests/meson-subproject/example.c ================================================ /* * MIT License * * Copyright (c) 2019-2022 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* Simple example testing that we compile and link in tskit and kastore * when we use meson submodules. */ #include #include #include #include void test_kas_strerror() { printf("test_kas_strerror\n"); const char *str = kas_strerror(KAS_ERR_NO_MEMORY); assert(strcmp(str, "Out of memory") == 0); } void test_strerror() { printf("test_strerror\n"); const char *str = tsk_strerror(TSK_ERR_NO_MEMORY); assert(strcmp(str, "Out of memory. (TSK_ERR_NO_MEMORY)") == 0); } void test_load_error() { printf("test_open_error\n"); tsk_treeseq_t ts; int ret = tsk_treeseq_load(&ts, "no such file", 0); assert(ret == TSK_ERR_IO); tsk_treeseq_free(&ts); } void test_table_basics() { printf("test_table_basics\n"); tsk_table_collection_t tables; int ret = tsk_table_collection_init(&tables, 0); assert(ret == 0); ret = tsk_node_table_add_row(&tables.nodes, 0, 1.0, TSK_NULL, TSK_NULL, NULL, 0); assert(ret == 0); ret = tsk_node_table_add_row(&tables.nodes, 0, 2.0, TSK_NULL, TSK_NULL, NULL, 0); assert(ret == 1); assert(tables.nodes.num_rows == 2); tsk_table_collection_free(&tables); } int main() { test_kas_strerror(); test_strerror(); test_load_error(); test_table_basics(); return 0; } ================================================ FILE: c/tests/meson-subproject/meson.build ================================================ project('example', 'c') tskit_proj = subproject('tskit') tskit_dep = tskit_proj.get_variable('tskit_dep') executable('example', 'example.c', dependencies : [tskit_dep], install : true) 
================================================
FILE: c/tests/test_convert.c
================================================
/*
 * MIT License
 *
 * Copyright (c) 2019-2022 Tskit Developers
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "testlib.h"
#include
#include
#include

/* Checks the Newick text produced by tsk_convert_newick for the single-tree
 * example when rooted at nodes 0, 4 and 6. Each root is converted twice: once
 * with TSK_NEWICK_LEGACY_MS_LABELS (leaves labelled "1", "2", ...) and once
 * with no options (leaves labelled "n0", "n1", ...). */
static void
test_single_tree_newick(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t t;
    size_t buffer_size = 1024;
    char newick[buffer_size];

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0)
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK)

    ret = tsk_convert_newick(&t, 0, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* Seems odd, but this is what a single node newick tree looks like.
     * Newick parsers seem to accept it in any case */
    CU_ASSERT_STRING_EQUAL(newick, "1;");

    ret = tsk_convert_newick(&t, 0, 0, 0, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_STRING_EQUAL(newick, "n0;");

    /* Subtree rooted at node 4 */
    ret = tsk_convert_newick(&t, 4, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_STRING_EQUAL(newick, "(1:1,2:1);");
    ret = tsk_convert_newick(&t, 4, 0, 0, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_STRING_EQUAL(newick, "(n0:1,n1:1);");

    /* Subtree rooted at node 6 */
    ret = tsk_convert_newick(&t, 6, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_STRING_EQUAL(newick, "((1:1,2:1):2,(3:2,4:2):1);");

    ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_STRING_EQUAL(newick, "((n0:1,n1:1):2,(n2:2,n3:2):1);");

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/* Checks the error codes returned by tsk_convert_newick: out-of-range root
 * nodes (-1 and 7), a NULL output buffer, and every buffer size smaller than
 * the full output (string length plus terminator). A buffer of exactly that
 * size must succeed. */
static void
test_single_tree_newick_errors(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t t;
    size_t j, len;
    size_t buffer_size = 1024;
    char newick[buffer_size];

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0)
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK)

    ret = tsk_convert_newick(&t, -1, 1, 0, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    ret = tsk_convert_newick(&t, 7, 1, 0, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

    ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);

    ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, newick);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* len is the smallest buffer that holds the output and its terminator;
     * every smaller size must overflow. */
    len = 1 + strlen(newick);
    for (j = 0; j < len; j++) {
        ret = tsk_convert_newick(&t, 6, 0, 0, j, newick);
        CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BUFFER_OVERFLOW);
    }
    ret = tsk_convert_newick(&t, 6, 0, TSK_NEWICK_LEGACY_MS_LABELS, len, newick);
    CU_ASSERT_EQUAL_FATAL(ret,
        0);
    CU_ASSERT_STRING_EQUAL(newick, "((1:1,2:1):2,(3:2,4:2):1);");

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/* Registers the newick conversion tests and hands control to test_main. */
int
main(int argc, char **argv)
{
    CU_TestInfo tests[] = {
        { "test_single_tree_newick", test_single_tree_newick },
        { "test_single_tree_newick_errors", test_single_tree_newick_errors },
        { NULL, NULL },
    };

    return test_main(tests, argc, argv);
}

================================================
FILE: c/tests/test_core.c
================================================
/*
 * MIT License
 *
 * Copyright (c) 2019-2024 Tskit Developers
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "testlib.h"
#include
#include
#include
#include

static void
test_strerror(void)
{
    int j;
    const char *msg;
    int max_error_code = 8192; /* totally arbitrary */

    for (j = 0; j < max_error_code; j++) {
        msg = tsk_strerror(-j);
        CU_ASSERT_FATAL(msg != NULL);
        CU_ASSERT(strlen(msg) > 0);
    }
    CU_ASSERT_STRING_EQUAL(
        tsk_strerror(0), "Normal exit condition.
This is not an error!"); } static void test_strerror_kastore(void) { int kastore_errors[] = { KAS_ERR_NO_MEMORY, KAS_ERR_KEY_NOT_FOUND, KAS_ERR_BAD_FILE_FORMAT }; size_t j; int err; for (j = 0; j < sizeof(kastore_errors) / sizeof(*kastore_errors); j++) { err = tsk_set_kas_error(kastore_errors[j]); CU_ASSERT_TRUE(tsk_is_kas_error(err)); CU_ASSERT_EQUAL_FATAL(tsk_get_kas_error(err), kastore_errors[j]); CU_ASSERT_STRING_EQUAL(tsk_strerror(err), kas_strerror(kastore_errors[j])); } } static void test_generate_uuid(void) { size_t uuid_size = 36; char uuid[uuid_size + 1]; char other_uuid[uuid_size + 1]; int ret; ret = tsk_generate_uuid(uuid, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(strlen(uuid), uuid_size); CU_ASSERT_EQUAL(uuid[8], '-'); CU_ASSERT_EQUAL(uuid[13], '-'); CU_ASSERT_EQUAL(uuid[18], '-'); CU_ASSERT_EQUAL(uuid[23], '-'); ret = tsk_generate_uuid(other_uuid, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(strlen(other_uuid), uuid_size); CU_ASSERT_STRING_NOT_EQUAL(uuid, other_uuid); } static void test_double_round(void) { struct test_case { double source; unsigned int num_digits; double result; }; struct test_case test_cases[] = { { 1.555, 3, 1.555 }, { 1.5555, 2, 1.56 }, /* catch the halfway between integers case */ { 1.5555, 3, 1.556 }, { 1.5111, 3, 1.511 }, { 1.5112, 3, 1.511 }, { 3.141592653589793, 0, 3.0 }, { 3.141592653589793, 1, 3.1 }, { 3.141592653589793, 2, 3.14 }, { 3.141592653589793, 3, 3.142 }, { 3.141592653589793, 4, 3.1416 }, { 3.141592653589793, 5, 3.14159 }, { 3.141592653589793, 6, 3.141593 }, { 3.141592653589793, 7, 3.1415927 }, { 3.141592653589793, 8, 3.14159265 }, { 3.141592653589793, 9, 3.141592654 }, { 3.141592653589793, 10, 3.1415926536 }, { 3.141592653589793, 11, 3.14159265359 }, { 3.141592653589793, 12, 3.14159265359 }, { 3.141592653589793, 13, 3.1415926535898 }, { 3.141592653589793, 14, 3.14159265358979 }, { 3.141592653589793, 15, 3.141592653589793 }, { 3.141592653589793, 16, 3.141592653589793 }, { 
3.141592653589793, 17, 3.141592653589793 }, { 3.141592653589793, 18, 3.141592653589793 }, { 3.141592653589793, 19, 3.141592653589793 }, /* We have tiny differences in precision at k=20; not worth worrying about. */ { 3.141592653589793, 21, 3.141592653589793 }, { 3.141592653589793, 22, 3.141592653589793 }, { 3.141592653589793, 23, 3.141592653589793 }, { 0.3333333333333333, 0, 0.0 }, { 0.3333333333333333, 1, 0.3 }, { 0.3333333333333333, 2, 0.33 }, { 0.3333333333333333, 3, 0.333 }, { 0.3333333333333333, 4, 0.3333 }, { 0.3333333333333333, 5, 0.33333 }, { 0.3333333333333333, 6, 0.333333 }, { 0.3333333333333333, 7, 0.3333333 }, { 0.3333333333333333, 8, 0.33333333 }, { 0.3333333333333333, 9, 0.333333333 }, { 0.3333333333333333, 10, 0.3333333333 }, { 0.3333333333333333, 11, 0.33333333333 }, { 0.3333333333333333, 12, 0.333333333333 }, { 0.3333333333333333, 13, 0.3333333333333 }, { 0.3333333333333333, 14, 0.33333333333333 }, { 0.3333333333333333, 15, 0.333333333333333 }, { 0.3333333333333333, 16, 0.3333333333333333 }, { 0.3333333333333333, 17, 0.3333333333333333 }, { 0.3333333333333333, 18, 0.3333333333333333 }, { 0.3333333333333333, 19, 0.3333333333333333 }, { 0.3333333333333333, 20, 0.3333333333333333 }, { 0.3333333333333333, 21, 0.3333333333333333 }, { 0.3333333333333333, 22, 0.3333333333333333 }, { 0.3333333333333333, 23, 0.3333333333333333 }, { 0.6666666666666666, 0, 1.0 }, { 0.6666666666666666, 1, 0.7 }, { 0.6666666666666666, 2, 0.67 }, { 0.6666666666666666, 3, 0.667 }, { 0.6666666666666666, 4, 0.6667 }, { 0.6666666666666666, 5, 0.66667 }, { 0.6666666666666666, 6, 0.666667 }, { 0.6666666666666666, 7, 0.6666667 }, { 0.6666666666666666, 8, 0.66666667 }, { 0.6666666666666666, 9, 0.666666667 }, { 0.6666666666666666, 10, 0.6666666667 }, { 0.6666666666666666, 11, 0.66666666667 }, { 0.6666666666666666, 12, 0.666666666667 }, { 0.6666666666666666, 13, 0.6666666666667 }, { 0.6666666666666666, 14, 0.66666666666667 }, { 0.6666666666666666, 15, 0.666666666666667 }, { 
0.6666666666666666, 16, 0.6666666666666666 }, { 0.6666666666666666, 17, 0.6666666666666666 }, { 0.6666666666666666, 18, 0.6666666666666666 }, { 0.6666666666666666, 19, 0.6666666666666666 }, { 0.6666666666666666, 20, 0.6666666666666666 }, { 0.6666666666666666, 21, 0.6666666666666666 }, { 0.6666666666666666, 22, 0.6666666666666666 }, { 0.6666666666666666, 23, 0.6666666666666666 }, { 0.07692307692307693, 0, 0.0 }, { 0.07692307692307693, 1, 0.1 }, { 0.07692307692307693, 2, 0.08 }, { 0.07692307692307693, 3, 0.077 }, { 0.07692307692307693, 4, 0.0769 }, { 0.07692307692307693, 5, 0.07692 }, { 0.07692307692307693, 6, 0.076923 }, { 0.07692307692307693, 7, 0.0769231 }, { 0.07692307692307693, 8, 0.07692308 }, { 0.07692307692307693, 9, 0.076923077 }, { 0.07692307692307693, 10, 0.0769230769 }, { 0.07692307692307693, 11, 0.07692307692 }, { 0.07692307692307693, 12, 0.076923076923 }, { 0.07692307692307693, 13, 0.0769230769231 }, { 0.07692307692307693, 14, 0.07692307692308 }, { 0.07692307692307693, 15, 0.076923076923077 }, { 0.07692307692307693, 16, 0.0769230769230769 }, { 0.07692307692307693, 17, 0.07692307692307693 }, { 0.07692307692307693, 18, 0.07692307692307693 }, { 0.07692307692307693, 19, 0.07692307692307693 }, { 0.07692307692307693, 20, 0.07692307692307693 }, /* Tiny difference in precision at k=21 */ { 0.07692307692307693, 22, 0.07692307692307693 }, { 0.07692307692307693, 23, 0.07692307692307693 }, { 1e-21, 0, 0.0 }, { 1e-21, 1, 0.0 }, { 1e-21, 2, 0.0 }, { 1e-21, 3, 0.0 }, { 1e-21, 4, 0.0 }, { 1e-21, 5, 0.0 }, { 1e-21, 6, 0.0 }, { 1e-21, 7, 0.0 }, { 1e-21, 8, 0.0 }, { 1e-21, 9, 0.0 }, { 1e-21, 10, 0.0 }, { 1e-21, 11, 0.0 }, { 1e-21, 12, 0.0 }, { 1e-21, 13, 0.0 }, { 1e-21, 14, 0.0 }, { 1e-21, 15, 0.0 }, { 1e-21, 16, 0.0 }, { 1e-21, 17, 0.0 }, { 1e-21, 18, 0.0 }, { 1e-21, 19, 0.0 }, { 1e-21, 20, 0.0 }, { 1e-21, 21, 1e-21 }, { 1e-21, 22, 1e-21 }, { 1e-21, 23, 1e-21 }, { 1e-10, 0, 0.0 }, { 1e-10, 1, 0.0 }, { 1e-10, 2, 0.0 }, { 1e-10, 3, 0.0 }, { 1e-10, 4, 0.0 }, { 1e-10, 5, 0.0 
}, { 1e-10, 6, 0.0 }, { 1e-10, 7, 0.0 }, { 1e-10, 8, 0.0 }, { 1e-10, 9, 0.0 }, { 1e-10, 10, 1e-10 }, { 1e-10, 11, 1e-10 }, { 1e-10, 12, 1e-10 }, { 1e-10, 13, 1e-10 }, { 1e-10, 14, 1e-10 }, { 1e-10, 15, 1e-10 }, { 1e-10, 16, 1e-10 }, { 1e-10, 17, 1e-10 }, { 1e-10, 18, 1e-10 }, { 1e-10, 19, 1e-10 }, { 1e-10, 20, 1e-10 }, { 1e-10, 21, 1e-10 }, { 1e-10, 22, 1e-10 }, { 1e-10, 23, 1e-10 }, { 3.141592653589793e-08, 0, 0.0 }, { 3.141592653589793e-08, 1, 0.0 }, { 3.141592653589793e-08, 2, 0.0 }, { 3.141592653589793e-08, 3, 0.0 }, { 3.141592653589793e-08, 4, 0.0 }, { 3.141592653589793e-08, 5, 0.0 }, { 3.141592653589793e-08, 6, 0.0 }, { 3.141592653589793e-08, 7, 0.0 }, { 3.141592653589793e-08, 8, 3e-08 }, { 3.141592653589793e-08, 9, 3.1e-08 }, { 3.141592653589793e-08, 10, 3.14e-08 }, { 3.141592653589793e-08, 11, 3.142e-08 }, { 3.141592653589793e-08, 12, 3.1416e-08 }, { 3.141592653589793e-08, 13, 3.14159e-08 }, { 3.141592653589793e-08, 14, 3.141593e-08 }, { 3.141592653589793e-08, 15, 3.1415927e-08 }, { 3.141592653589793e-08, 16, 3.14159265e-08 }, { 3.141592653589793e-08, 17, 3.141592654e-08 }, { 3.141592653589793e-08, 18, 3.1415926536e-08 }, { 3.141592653589793e-08, 19, 3.14159265359e-08 }, { 3.141592653589793e-08, 20, 3.14159265359e-08 }, { 3.141592653589793e-08, 21, 3.1415926535898e-08 }, /* Tiny precision mismatch at k=22 */ { 3.141592653589793e-08, 23, 3.141592653589793e-08 }, }; size_t num_test_cases = sizeof(test_cases) / sizeof(*test_cases); size_t j; for (j = 0; j < num_test_cases; j++) { CU_ASSERT_EQUAL_FATAL(tsk_round(test_cases[j].source, test_cases[j].num_digits), test_cases[j].result); } } static void test_blkalloc(void) { tsk_blkalloc_t alloc; int ret; size_t j, block_size; void *mem; ret = tsk_blkalloc_init(&alloc, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_blkalloc_free(&alloc); for (block_size = 1; block_size < 10; block_size++) { ret = tsk_blkalloc_init(&alloc, block_size); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < 10; j++) { mem = 
tsk_blkalloc_get(&alloc, block_size); CU_ASSERT_TRUE(mem != NULL); CU_ASSERT_EQUAL(alloc.num_chunks, j + 1); tsk_memset(mem, 0, block_size); } mem = tsk_blkalloc_get(&alloc, block_size + 1); CU_ASSERT_EQUAL(mem, NULL); mem = tsk_blkalloc_get(&alloc, block_size + 2); CU_ASSERT_EQUAL(mem, NULL); tsk_blkalloc_print_state(&alloc, _devnull); tsk_blkalloc_free(&alloc); } /* Allocate awkward sized chunk */ ret = tsk_blkalloc_init(&alloc, 100); CU_ASSERT_EQUAL_FATAL(ret, 0); mem = tsk_blkalloc_get(&alloc, 90); CU_ASSERT_FATAL(mem != NULL); tsk_memset(mem, 0, 90); mem = tsk_blkalloc_get(&alloc, 10); CU_ASSERT_FATAL(mem != NULL); tsk_memset(mem, 0, 10); CU_ASSERT_EQUAL(alloc.num_chunks, 1); mem = tsk_blkalloc_get(&alloc, 90); CU_ASSERT_FATAL(mem != NULL); tsk_memset(mem, 0, 90); CU_ASSERT_EQUAL(alloc.num_chunks, 2); mem = tsk_blkalloc_get(&alloc, 11); CU_ASSERT_FATAL(mem != NULL); tsk_memset(mem, 0, 11); CU_ASSERT_EQUAL(alloc.num_chunks, 3); tsk_blkalloc_free(&alloc); } static void test_unknown_time(void) { CU_ASSERT_TRUE(tsk_isnan(TSK_UNKNOWN_TIME)); CU_ASSERT_TRUE(tsk_is_unknown_time(TSK_UNKNOWN_TIME)); CU_ASSERT_FALSE(tsk_is_unknown_time(NAN)); CU_ASSERT_FALSE(tsk_is_unknown_time(0)); CU_ASSERT_FALSE(tsk_is_unknown_time(INFINITY)); CU_ASSERT_FALSE(tsk_is_unknown_time(1)); } static void test_malloc_zero(void) { void *p = tsk_malloc(0); CU_ASSERT_FATAL(p != NULL); free(p); p = tsk_calloc(0, 1); CU_ASSERT_FATAL(p != NULL); free(p); } static void test_malloc_overflow(void) { #if TSK_MAX_SIZE > SIZE_MAX tsk_size_t size_max = SIZE_MAX; void *p = tsk_malloc(size_max + 1); CU_ASSERT_FATAL(p == NULL); p = tsk_calloc(size_max + 1, 1); CU_ASSERT_FATAL(p == NULL); #endif } static void test_debug_stream(void) { FILE *f = fopen(_tmp_file_name, "w"); CU_ASSERT_FATAL(tsk_get_debug_stream() == stdout); CU_ASSERT_FATAL(tsk_get_debug_stream() == stdout); tsk_set_debug_stream(f); CU_ASSERT_FATAL(tsk_get_debug_stream() == f); tsk_set_debug_stream(stdout); 
CU_ASSERT_FATAL(tsk_get_debug_stream() == stdout); fclose(f); } static int validate_avl_node(tsk_avl_node_int_t *node) { int height, lheight, rheight; if (node == NULL) { return 0; } lheight = validate_avl_node(node->llink); rheight = validate_avl_node(node->rlink); height = 1 + TSK_MAX(lheight, rheight); if (lheight != 0 && rheight != 0) { CU_ASSERT_FATAL(node->balance == rheight - lheight); } else if (lheight == 0 && rheight == 0) { CU_ASSERT_FATAL(height == 1); CU_ASSERT_FATAL(node->balance == 0); } else { CU_ASSERT_FATAL(height == 2); if (lheight == 0) { CU_ASSERT_FATAL(node->balance == 1); } else { CU_ASSERT_FATAL(node->balance == -1); } } return height; } static void test_avl_empty(void) { int height; tsk_avl_tree_int_t tree; tsk_avl_tree_int_init(&tree); height = validate_avl_node(tree.head.rlink); CU_ASSERT_EQUAL((tsk_size_t) height, tree.height); CU_ASSERT_EQUAL(0, tree.size); tsk_avl_tree_int_print_state(&tree, _devnull); CU_ASSERT_EQUAL(tsk_avl_tree_int_search(&tree, -1), NULL); CU_ASSERT_EQUAL(tsk_avl_tree_int_search(&tree, 0), NULL); CU_ASSERT_EQUAL(tsk_avl_tree_int_search(&tree, 1), NULL); tsk_avl_tree_int_free(&tree); } static void validate_avl(size_t num_keys, int64_t *keys) { size_t j, k; int ret, height; tsk_avl_tree_int_t tree; tsk_avl_node_int_t *nodes = malloc(num_keys * sizeof(*nodes)); tsk_avl_node_int_t **ordered_nodes = malloc(num_keys * sizeof(*ordered_nodes)); tsk_avl_node_int_t *node; tsk_avl_node_int_t tmp_node; CU_ASSERT_FATAL(nodes != NULL); CU_ASSERT_FATAL(ordered_nodes != NULL); tsk_avl_tree_int_init(&tree); /* Assumes the keys are unique */ for (j = 0; j < num_keys; j++) { node = nodes + j; node->key = keys[j]; CU_ASSERT_EQUAL(tsk_avl_tree_int_search(&tree, keys[j]), NULL); ret = tsk_avl_tree_int_insert(&tree, node); CU_ASSERT_FATAL(ret == 0); CU_ASSERT_EQUAL(tsk_avl_tree_int_search(&tree, keys[j]), node); tmp_node.key = keys[j]; ret = tsk_avl_tree_int_insert(&tree, &tmp_node); CU_ASSERT_FATAL(ret == 1); height = 
validate_avl_node(tree.head.rlink); CU_ASSERT_EQUAL((tsk_size_t) height, tree.height); CU_ASSERT_EQUAL(j + 1, tree.size); tsk_avl_tree_int_print_state(&tree, _devnull); for (k = j + 1; k < num_keys; k++) { CU_ASSERT_EQUAL(tsk_avl_tree_int_search(&tree, keys[k]), NULL); } } tsk_avl_tree_int_ordered_nodes(&tree, ordered_nodes); for (j = 0; j < num_keys; j++) { if (j > 0) { CU_ASSERT_FATAL(ordered_nodes[j - 1]->key < ordered_nodes[j]->key); } } tsk_avl_tree_int_free(&tree); free(nodes); free(ordered_nodes); } static void test_avl_sequential(void) { int64_t keys[] = { 0, 1, 2, 3, 4, 5, 6, 7 }; int64_t reversed_keys[] = { 7, 6, 5, 4, 3, 2, 1, 0 }; validate_avl(8, keys); validate_avl(8, reversed_keys); } static void test_avl_interleaved(void) { size_t num_keys = 100; size_t j; int64_t *keys = malloc(num_keys * sizeof(*keys)); CU_ASSERT_FATAL(keys != NULL); for (j = 0; j < num_keys; j++) { keys[j] = (int64_t) j; if (j % 2 == 0) { keys[j] *= -1; } } validate_avl(num_keys, keys); free(keys); } static void test_avl_random(void) { /* This example goes through all the code paths in the AVL insert algorithm */ int64_t keys[] = { 2, 79, -8, -86, 6, -29, 88, -80, 21, -26, -13, 16, -1, 3, 51, 30, 49, -48, -99, 57, -63, 29, 91, 87, 60, -43, -79, -12, -52, -42, 69, 89, 74, -50, 7, -46, -37, 34, -28, 66, -83, 31, -41, -87, -92, -11, -17, -9, 10, 98, 71, -93, -66, -20, 63, -51, 33, -47, 5, -97, 90, 45, -57, 61, -6, -53, 99, -61, -19, -77, 53, 23, -60, 56, -56, -36, -30, 28, 35, -38, 38, 62, -68, 22, -96, -73, -89, 50 }; validate_avl(sizeof(keys) / sizeof(*keys), keys); } static void test_bit_arrays(void) { // NB: This test is only valid for the 32 bit implementation of bit arrays. 
If we // were to change the chunk size of a bit array, we'd need to update these tests tsk_bitset_t arr; tsk_id_t items_truth[64] = { 0 }, items[64] = { 0 }; tsk_size_t n_items = 0, n_items_truth = 0; // test item retrieval tsk_bitset_init(&arr, 90, 1); CU_ASSERT_EQUAL_FATAL(arr.len, 1); CU_ASSERT_EQUAL_FATAL(arr.row_len, 3); tsk_bitset_get_items(&arr, 0, items, &n_items); assert_arrays_equal(n_items_truth, items, items_truth); for (tsk_bitset_val_t i = 0; i < 20; i++) { tsk_bitset_set_bit(&arr, 0, i); items_truth[n_items_truth] = (tsk_id_t) i; n_items_truth++; } tsk_bitset_set_bit(&arr, 0, 63); tsk_bitset_set_bit(&arr, 0, 65); // these assertions are only valid for 32-bit values CU_ASSERT_EQUAL_FATAL(arr.data[0], 1048575); CU_ASSERT_EQUAL_FATAL(arr.data[1], 2147483648); CU_ASSERT_EQUAL_FATAL(arr.data[2], 2); // verify our assumptions about bit array counting CU_ASSERT_EQUAL_FATAL(tsk_bitset_count(&arr, 0), 22); tsk_bitset_get_items(&arr, 0, items, &n_items); assert_arrays_equal(n_items_truth, items, items_truth); tsk_memset(items, 0, 64); tsk_memset(items_truth, 0, 64); n_items = n_items_truth = 0; tsk_bitset_free(&arr); // create a length-2 array with 64 bit capacity (two chunks per row) tsk_bitset_init(&arr, 64, 2); CU_ASSERT_EQUAL_FATAL(arr.len, 2); CU_ASSERT_EQUAL_FATAL(arr.row_len, 2); // fill the first 50 bits of the first row for (tsk_bitset_val_t i = 0; i < 50; i++) { tsk_bitset_set_bit(&arr, 0, i); items_truth[n_items_truth] = (tsk_id_t) i; n_items_truth++; } tsk_bitset_get_items(&arr, 0, items, &n_items); assert_arrays_equal(n_items_truth, items, items_truth); tsk_memset(items, 0, 64); tsk_memset(items_truth, 0, 64); n_items = n_items_truth = 0; // fill bits 20-40 of the second row for (tsk_bitset_val_t i = 20; i < 40; i++) { tsk_bitset_set_bit(&arr, 1, i); items_truth[n_items_truth] = (tsk_id_t) i; n_items_truth++; } tsk_bitset_get_items(&arr, 1, items, &n_items); assert_arrays_equal(n_items_truth, items, items_truth); tsk_memset(items, 0, 64); 
tsk_memset(items_truth, 0, 64); n_items = n_items_truth = 0; // verify our assumptions about row selection CU_ASSERT_EQUAL_FATAL(arr.data[0], 4294967295); // row1 elem1 CU_ASSERT_EQUAL_FATAL(arr.data[1], 262143); // row1 elem2 CU_ASSERT_EQUAL_FATAL(arr.data[2], 4293918720); // row2 elem1 CU_ASSERT_EQUAL_FATAL(arr.data[3], 255); // row2 elem2 // subtract the second from the first row, store in first tsk_bitset_subtract(&arr, 0, &arr, 1); // verify our assumptions about subtraction CU_ASSERT_EQUAL_FATAL(arr.data[0], 1048575); CU_ASSERT_EQUAL_FATAL(arr.data[1], 261888); tsk_bitset_t int_result; tsk_bitset_init(&int_result, 64, 1); CU_ASSERT_EQUAL_FATAL(int_result.len, 1); CU_ASSERT_EQUAL_FATAL(int_result.row_len, 2); // their intersection should be zero tsk_bitset_intersect(&arr, 0, &arr, 1, &int_result); CU_ASSERT_EQUAL_FATAL(int_result.data[0], 0); CU_ASSERT_EQUAL_FATAL(int_result.data[1], 0); // now, add them back together, storing back in a tsk_bitset_union(&arr, 0, &arr, 1); // now, their intersection should be the subtracted chunk (20-40) tsk_bitset_intersect(&arr, 0, &arr, 1, &int_result); CU_ASSERT_EQUAL_FATAL(int_result.data[0], 4293918720); CU_ASSERT_EQUAL_FATAL(int_result.data[1], 255); tsk_bitset_free(&int_result); tsk_bitset_free(&arr); } static void test_meson_version(void) { char version[100]; sprintf( version, "%d.%d.%d", TSK_VERSION_MAJOR, TSK_VERSION_MINOR, TSK_VERSION_PATCH); /* the MESON_PROJECT_VERSION define is passed in by meson when compiling */ CU_ASSERT_STRING_EQUAL(version, MESON_PROJECT_VERSION); } int main(int argc, char **argv) { CU_TestInfo tests[] = { { "test_strerror", test_strerror }, { "test_strerror_kastore", test_strerror_kastore }, { "test_generate_uuid", test_generate_uuid }, { "test_double_round", test_double_round }, { "test_blkalloc", test_blkalloc }, { "test_unknown_time", test_unknown_time }, { "test_malloc_zero", test_malloc_zero }, { "test_malloc_overflow", test_malloc_overflow }, { "test_debug_stream", test_debug_stream 
}, { "test_avl_empty", test_avl_empty }, { "test_avl_sequential", test_avl_sequential }, { "test_avl_interleaved", test_avl_interleaved }, { "test_avl_random", test_avl_random }, { "test_bit_arrays", test_bit_arrays }, { "test_meson_version", test_meson_version }, { NULL, NULL }, }; return test_main(tests, argc, argv); } ================================================ FILE: c/tests/test_file_format.c ================================================ /* * MIT License * * Copyright (c) 2019-2022 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

#include "testlib.h"
#include

/* Describes one array to be written to a kastore file: the key it is stored
 * under, the data pointer, the number of elements and the KAS_* type code. */
typedef struct {
    const char *name;
    void *array;
    tsk_size_t len;
    int type;
} write_table_col_t;

/* Writes each of the num_cols entries in write_cols to the store with
 * kastore_puts, failing the test fatally on the first non-zero return. */
static void
write_table_cols(kastore_t *store, write_table_col_t *write_cols, size_t num_cols)
{
    size_t j;
    int ret;

    for (j = 0; j < num_cols; j++) {
        ret = kastore_puts(store, write_cols[j].name, write_cols[j].array,
            (size_t) write_cols[j].len, write_cols[j].type, 0);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
    }
}

/* Dumps ts to a temporary file, reads that file back as a kastore, and writes
 * every item to outfile except those whose key matches one of the
 * num_drop_cols entries in drop_cols. The temporary file is unlinked before
 * the function returns. */
static void
copy_store_drop_columns(
    tsk_treeseq_t *ts, size_t num_drop_cols, const char **drop_cols, const char *outfile)
{
    int ret = 0;
    char tmpfile[] = "/tmp/tsk_c_test_copy_XXXXXX";
    int fd;
    kastore_t read_store, write_store;
    kaitem_t *item;
    size_t j, k;
    bool keep;

    /* mkstemp is only used to reserve a unique name; the descriptor itself is
     * not needed because the file is rewritten by path below. */
    fd = mkstemp(tmpfile);
    CU_ASSERT_FATAL(fd != -1);
    close(fd);

    ret = tsk_treeseq_dump(ts, tmpfile, 0);
    if (ret != 0) {
        /* Remove the temporary file before the fatal assertion fires. */
        unlink(tmpfile);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
    }

    ret = kastore_open(&read_store, tmpfile, "r", KAS_READ_ALL);
    /* We can now unlink the file as either kastore has read it all, or failed */
    unlink(tmpfile);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = kastore_open(&write_store, outfile, "w", 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Note: this API is not a documented part of kastore, so may be subject to
     * change.
     */
    for (j = 0; j < read_store.num_items; j++) {
        item = &read_store.items[j];
        keep = true;
        /* Keys are compared by length first and then by content over exactly
         * key_len bytes. */
        for (k = 0; k < num_drop_cols; k++) {
            if (strlen(drop_cols[k]) == item->key_len
                && strncmp(drop_cols[k], item->key, item->key_len) == 0) {
                keep = false;
                break;
            }
        }
        if (keep) {
            ret = kastore_put(&write_store, item->key, item->key_len, item->array,
                item->array_len, item->type, 0);
            CU_ASSERT_EQUAL_FATAL(ret, 0);
        }
    }
    kastore_close(&read_store);
    ret = kastore_close(&write_store);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
}

static void
test_format_data_load_errors(void)
{
    size_t uuid_size = 36;
    char uuid[uuid_size];
    char format_name[TSK_FILE_FORMAT_NAME_LENGTH];
    double L[2];
    uint32_t version[2]
        = { TSK_FILE_FORMAT_VERSION_MAJOR, TSK_FILE_FORMAT_VERSION_MINOR };
    write_table_col_t write_cols[] = {
        { "format/name", (void *) format_name, sizeof(format_name), KAS_INT8 },
        { "format/version", (void *) version, 2, KAS_UINT32 },
        { "sequence_length", (void *) L, 1, KAS_FLOAT64 },
        { "uuid", (void *) uuid, (tsk_size_t) uuid_size, KAS_INT8 },
    };
    tsk_table_collection_t tables;
    kastore_t store;
    size_t j;
    int ret;

    L[0] = 1;
    L[1] = 0;
    tsk_memcpy(format_name, TSK_FILE_FORMAT_NAME, sizeof(format_name));
    /* Note: this will fail if we ever start parsing the form of the UUID */
    tsk_memset(uuid, 0, uuid_size);

    ret = kastore_open(&store, _tmp_file_name, "w", 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols));
    ret = kastore_close(&store);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_table_collection_load(&tables, _tmp_file_name, 0);
    /* We've only defined the format headers, so we should fail immediately
     * after with required columns not found */
    CU_ASSERT_FALSE(tsk_is_kas_error(ret));
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_REQUIRED_COL_NOT_FOUND);
    ret = tsk_table_collection_free(&tables);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Version too old */
    version[0] = TSK_FILE_FORMAT_VERSION_MAJOR - 1;
    ret = kastore_open(&store, _tmp_file_name, "w", 0);
CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_VERSION_TOO_OLD); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Version too new */ version[0] = TSK_FILE_FORMAT_VERSION_MAJOR + 1; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_VERSION_TOO_NEW); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); version[0] = TSK_FILE_FORMAT_VERSION_MAJOR; /* Bad version length */ write_cols[1].len = 0; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); write_cols[1].len = 2; /* Bad format name length */ write_cols[0].len = 0; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); write_cols[0].len = TSK_FILE_FORMAT_NAME_LENGTH; /* Bad format name */ format_name[0] = 'X'; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 
0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); format_name[0] = 't'; /* Bad type for sequence length. */ write_cols[2].type = KAS_FLOAT32; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_TRUE(tsk_is_kas_error(ret)); CU_ASSERT_EQUAL_FATAL(ret ^ (1 << TSK_KAS_ERR_BIT), KAS_ERR_TYPE_MISMATCH); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); write_cols[2].type = KAS_FLOAT64; /* Bad length for sequence length. */ write_cols[2].len = 2; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); write_cols[2].len = 1; /* Bad value for sequence length. 
*/ L[0] = -1; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); L[0] = 1; /* Wrong length for uuid */ write_cols[3].len = 1; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); write_cols[3].len = (tsk_size_t) uuid_size; /* Missing keys */ for (j = 0; j < sizeof(write_cols) / sizeof(*write_cols) - 1; j++) { ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, j); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_REQUIRED_COL_NOT_FOUND); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); } } static void test_missing_optional_column_pairs(void) { int ret; size_t j; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t t1, t2; const char *required_cols[][2] = { { "edges/metadata", "edges/metadata_offset" }, { "migrations/metadata", "migrations/metadata_offset" }, { "individuals/parents", "individuals/parents_offset" } }; const char *drop_cols[2]; ret = tsk_treeseq_copy_tables(ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < sizeof(required_cols) / sizeof(*required_cols); j++) { drop_cols[0] = required_cols[j][0]; copy_store_drop_columns(ts, 1, 
drop_cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BOTH_COLUMNS_REQUIRED); tsk_table_collection_free(&t2); drop_cols[0] = required_cols[j][1]; copy_store_drop_columns(ts, 1, drop_cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BOTH_COLUMNS_REQUIRED); tsk_table_collection_free(&t2); drop_cols[0] = required_cols[j][0]; drop_cols[1] = required_cols[j][1]; copy_store_drop_columns(ts, 2, drop_cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); } tsk_table_collection_free(&t1); tsk_treeseq_free(ts); free(ts); } static void test_missing_required_column_pairs(void) { int ret; size_t j; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t t; const char *required_cols[][2] = { { "individuals/location", "individuals/location_offset" }, { "individuals/metadata", "individuals/metadata_offset" }, { "mutations/derived_state", "mutations/derived_state_offset" }, { "mutations/metadata", "mutations/metadata_offset" }, { "nodes/metadata", "nodes/metadata_offset" }, { "populations/metadata", "populations/metadata_offset" }, { "provenances/record", "provenances/record_offset" }, { "provenances/timestamp", "provenances/timestamp_offset" }, { "sites/ancestral_state", "sites/ancestral_state_offset" }, { "sites/metadata", "sites/metadata_offset" }, }; const char *drop_cols[2]; for (j = 0; j < sizeof(required_cols) / sizeof(*required_cols); j++) { drop_cols[0] = required_cols[j][0]; copy_store_drop_columns(ts, 1, drop_cols, _tmp_file_name); ret = tsk_table_collection_load(&t, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_REQUIRED_COL_NOT_FOUND); tsk_table_collection_free(&t); drop_cols[0] = required_cols[j][1]; copy_store_drop_columns(ts, 1, drop_cols, _tmp_file_name); ret = 
tsk_table_collection_load(&t, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BOTH_COLUMNS_REQUIRED); tsk_table_collection_free(&t); copy_store_drop_columns(ts, 2, required_cols[j], _tmp_file_name); ret = tsk_table_collection_load(&t, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_REQUIRED_COL_NOT_FOUND); tsk_table_collection_free(&t); } tsk_treeseq_free(ts); free(ts); } static void verify_bad_offset_columns(tsk_treeseq_t *ts, const char *offset_col) { int ret = 0; kastore_t store; tsk_table_collection_t tables; uint32_t *offset_array, *offset_copy; size_t offset_len; int type; uint32_t data_len; ret = tsk_treeseq_dump(ts, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_open(&store, _tmp_file_name, "r", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_gets(&store, offset_col, (void **) &offset_array, &offset_len, &type); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(type, KAS_UINT32); offset_copy = malloc(offset_len * sizeof(*offset_array)); CU_ASSERT_FATAL(offset_copy != NULL); tsk_memcpy(offset_copy, offset_array, offset_len * sizeof(*offset_array)); data_len = offset_array[offset_len - 1]; CU_ASSERT_TRUE(data_len > 0); kastore_close(&store); offset_copy[0] = UINT32_MAX; copy_store_drop_columns(ts, 1, &offset_col, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, offset_col, offset_copy, offset_len, KAS_UINT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_OFFSET); tsk_table_collection_free(&tables); offset_copy[0] = 0; offset_copy[offset_len - 1] = 0; copy_store_drop_columns(ts, 1, &offset_col, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, offset_col, offset_copy, offset_len, KAS_UINT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); 
ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_OFFSET); tsk_table_collection_free(&tables); offset_copy[offset_len - 1] = data_len + 1; copy_store_drop_columns(ts, 1, &offset_col, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, offset_col, offset_copy, offset_len, KAS_UINT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_OFFSET); tsk_table_collection_free(&tables); copy_store_drop_columns(ts, 1, &offset_col, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, offset_col, NULL, 0, KAS_UINT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); tsk_table_collection_free(&tables); copy_store_drop_columns(ts, 1, &offset_col, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, offset_col, offset_copy, offset_len, KAS_FLOAT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COLUMN_TYPE); tsk_table_collection_free(&tables); free(offset_copy); } static void test_bad_offset_columns(void) { size_t j; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); /* We exclude "provenances/timestamp_offset" here because there are no * non-ragged columns in the provenances table, so this doesn't quite * fit into the same pattern as the other tables */ const char *cols[] = { "edges/metadata_offset", 
"migrations/metadata_offset", "individuals/location_offset", "individuals/parents_offset", "individuals/metadata_offset", "mutations/derived_state_offset", "mutations/metadata_offset", "nodes/metadata_offset", "populations/metadata_offset", "provenances/record_offset", "sites/ancestral_state_offset", "sites/metadata_offset", }; for (j = 0; j < sizeof(cols) / sizeof(*cols); j++) { verify_bad_offset_columns(ts, cols[j]); } tsk_treeseq_free(ts); free(ts); } static void test_force_offset_64(void) { int ret; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t t1; tsk_table_collection_t t2; kastore_t store; kaitem_t *item; const char *suffix; const char *offset_str = "_offset"; int num_found = 0; size_t j; ret = tsk_treeseq_dump(ts, _tmp_file_name, TSK_DUMP_FORCE_OFFSET_64); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_open(&store, _tmp_file_name, "r", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < store.num_items; j++) { item = &store.items[j]; /* Does the key end in "_offset"? 
*/ if (item->key_len > strlen(offset_str)) { suffix = item->key + (item->key_len - strlen(offset_str)); if (strncmp(suffix, offset_str, strlen(offset_str)) == 0) { CU_ASSERT_EQUAL(item->type, KAS_UINT64); num_found++; } } } CU_ASSERT_TRUE(num_found > 0); kastore_close(&store); ret = tsk_table_collection_load(&t1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_copy_tables(ts, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t1); tsk_table_collection_free(&t2); tsk_treeseq_free(ts); free(ts); } static void test_missing_indexes(void) { int ret; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t t1, t2; const char *cols[] = { "indexes/edge_insertion_order", "indexes/edge_removal_order" }; const char *drop_cols[2]; ret = tsk_treeseq_copy_tables(ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); drop_cols[0] = cols[0]; copy_store_drop_columns(ts, 1, drop_cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BOTH_COLUMNS_REQUIRED); tsk_table_collection_free(&t2); drop_cols[0] = cols[1]; copy_store_drop_columns(ts, 1, drop_cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BOTH_COLUMNS_REQUIRED); tsk_table_collection_free(&t2); copy_store_drop_columns(ts, 2, cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); CU_ASSERT_FALSE(tsk_table_collection_has_index(&t2, 0)); tsk_table_collection_free(&t2); tsk_table_collection_free(&t1); tsk_treeseq_free(ts); free(ts); } static void test_malformed_indexes(void) { int ret; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t tables; tsk_treeseq_t ts2; tsk_size_t num_edges = tsk_treeseq_get_num_edges(ts); tsk_id_t *bad_index = tsk_calloc(num_edges, sizeof(tsk_id_t)); 
tsk_id_t *good_index = tsk_calloc(num_edges, sizeof(tsk_id_t)); kastore_t store; const char *cols[] = { "indexes/edge_insertion_order", "indexes/edge_removal_order" }; CU_ASSERT_FATAL(bad_index != NULL); CU_ASSERT_FATAL(good_index != NULL); /* If both columns are not the same length as the number of edges we * should raise an error */ copy_store_drop_columns(ts, 2, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, cols[0], NULL, 0, TSK_ID_STORAGE_TYPE, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, cols[1], NULL, 0, TSK_ID_STORAGE_TYPE, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); tsk_table_collection_free(&tables); bad_index[0] = -1; copy_store_drop_columns(ts, 2, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts( &store, cols[0], good_index, (size_t) num_edges, TSK_ID_STORAGE_TYPE, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts( &store, cols[1], bad_index, (size_t) num_edges, TSK_ID_STORAGE_TYPE, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_load(&ts2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts2); copy_store_drop_columns(ts, 2, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts( &store, cols[0], bad_index, (size_t) num_edges, TSK_ID_STORAGE_TYPE, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts( &store, cols[1], good_index, (size_t) num_edges, TSK_ID_STORAGE_TYPE, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_load(&ts2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 
TSK_ERR_EDGE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts2); copy_store_drop_columns(ts, 1, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, cols[0], bad_index, (size_t) num_edges, KAS_FLOAT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_load(&ts2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COLUMN_TYPE); tsk_treeseq_free(&ts2); free(good_index); free(bad_index); tsk_treeseq_free(ts); free(ts); } static void test_missing_reference_sequence(void) { int ret; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t t1, t2; const char *cols[] = { "reference_sequence/data", "reference_sequence/url", "reference_sequence/metadata_schema", "reference_sequence/metadata" }; CU_ASSERT_TRUE(tsk_treeseq_has_reference_sequence(ts)); ret = tsk_treeseq_copy_tables(ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); copy_store_drop_columns(ts, 1, cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_TRUE(tsk_table_collection_has_reference_sequence(&t2)); tsk_table_collection_free(&t2); copy_store_drop_columns(ts, 2, cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_TRUE(tsk_table_collection_has_reference_sequence(&t2)); tsk_table_collection_free(&t2); copy_store_drop_columns(ts, 3, cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_TRUE(tsk_table_collection_has_reference_sequence(&t2)); tsk_table_collection_free(&t2); /* Dropping all the columns gives us a NULL reference_sequence, though */ copy_store_drop_columns(ts, 4, cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_has_reference_sequence(&t2)); tsk_table_collection_free(&t2); tsk_table_collection_free(&t1); tsk_treeseq_free(ts); free(ts); } static void 
test_bad_column_types(void) { int ret; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t tables; tsk_size_t num_edges = tsk_treeseq_get_num_edges(ts); /* make sure we have enough memory in all cases */ tsk_id_t *col_memory = tsk_calloc(num_edges + 1, sizeof(double)); kastore_t store; const char *cols[1]; CU_ASSERT_FATAL(col_memory != NULL); cols[0] = "edges/left"; copy_store_drop_columns(ts, 1, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, cols[0], col_memory, (size_t) num_edges, KAS_FLOAT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COLUMN_TYPE); tsk_table_collection_free(&tables); cols[0] = "edges/metadata_offset"; copy_store_drop_columns(ts, 1, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts( &store, cols[0], col_memory, (size_t) num_edges + 1, KAS_FLOAT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COLUMN_TYPE); tsk_table_collection_free(&tables); cols[0] = "edges/metadata"; copy_store_drop_columns(ts, 1, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, cols[0], NULL, 0, KAS_FLOAT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COLUMN_TYPE); tsk_table_collection_free(&tables); cols[0] = "edges/metadata_schema"; copy_store_drop_columns(ts, 1, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = 
kastore_puts(&store, cols[0], NULL, 0, KAS_FLOAT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COLUMN_TYPE); tsk_table_collection_free(&tables); cols[0] = "reference_sequence/metadata"; copy_store_drop_columns(ts, 1, cols, _tmp_file_name); ret = kastore_open(&store, _tmp_file_name, "a", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_puts(&store, cols[0], NULL, 0, KAS_FLOAT32, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COLUMN_TYPE); tsk_table_collection_free(&tables); free(col_memory); tsk_treeseq_free(ts); free(ts); } static void test_missing_required_columns(void) { int ret; size_t j; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t t; const char *required_cols[] = { "edges/child", "edges/left", "edges/parent", "edges/right", "format/name", "format/version", "individuals/flags", "migrations/dest", "migrations/left", "migrations/node", "migrations/right", "migrations/source", "migrations/time", "mutations/node", "mutations/parent", "mutations/site", "nodes/flags", "nodes/individual", "nodes/population", "nodes/time", "sequence_length", "sites/position", "uuid", }; const char *drop_cols[1]; for (j = 0; j < sizeof(required_cols) / sizeof(*required_cols); j++) { drop_cols[0] = required_cols[j]; copy_store_drop_columns(ts, 1, drop_cols, _tmp_file_name); ret = tsk_table_collection_load(&t, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_REQUIRED_COL_NOT_FOUND); tsk_table_collection_free(&t); } tsk_treeseq_free(ts); free(ts); } static void test_metadata_schemas_optional(void) { int ret; size_t j; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t t1, t2; const char *cols[] = { "metadata", "metadata_schema", 
"reference_sequence/metadata", "reference_sequence/metadata_schema", "individuals/metadata_schema", "populations/metadata_schema", "nodes/metadata_schema", "edges/metadata_schema", "sites/metadata_schema", "mutations/metadata_schema", "migrations/metadata_schema", }; const char *drop_cols[1]; ret = tsk_treeseq_copy_tables(ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < sizeof(cols) / sizeof(*cols); j++) { drop_cols[0] = cols[j]; copy_store_drop_columns(ts, 1, drop_cols, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* metadata schemas are included in data comparisons */ CU_ASSERT_FALSE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); } tsk_table_collection_free(&t1); tsk_treeseq_free(ts); free(ts); } /* This test is problematic on windows because of the different off_t * types. Doesn't seem worth the trouble of getting it working. */ static void test_load_bad_file_formats(void) { #if !defined(_WIN32) tsk_table_collection_t tables; tsk_treeseq_t ts; int ret, ret2; off_t offset; FILE *f; /* A zero byte file is TSK_ERR_EOF */ f = fopen(_tmp_file_name, "w+"); ret = tsk_table_collection_loadf(&tables, f, 0); ret2 = tsk_treeseq_loadf(&ts, f, 0); CU_ASSERT_EQUAL_FATAL(ret, ret2); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); fclose(f); for (offset = 1; offset < 100; offset++) { ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; ret = tsk_table_collection_dump(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret2 = truncate(_tmp_file_name, offset); CU_ASSERT_EQUAL_FATAL(ret2, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret ^ (1 << TSK_KAS_ERR_BIT), KAS_ERR_BAD_FILE_FORMAT); tsk_table_collection_free(&tables); } #endif } static void test_load_errors(void) { tsk_table_collection_t tables; tsk_treeseq_t ts; int 
ret, ret2; const char *str; FILE *f; ret = tsk_table_collection_load(&tables, "/", 0); ret2 = tsk_treeseq_load(&ts, "/", 0); CU_ASSERT_EQUAL_FATAL(ret, ret2); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); CU_ASSERT_STRING_EQUAL(str, strerror(EISDIR)); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); ret = tsk_table_collection_load(&tables, "/bin/theres_no_way_this_file_exists", 0); ret2 = tsk_treeseq_load(&ts, "/bin/theres_no_way_this_file_exists", 0); CU_ASSERT_EQUAL_FATAL(ret, ret2); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); CU_ASSERT_STRING_EQUAL(str, strerror(ENOENT)); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); ret = tsk_table_collection_load(&tables, "/bin/sh", 0); ret2 = tsk_treeseq_load(&ts, "/bin/sh", 0); CU_ASSERT_EQUAL_FATAL(ret, ret2); CU_ASSERT_TRUE(tsk_is_kas_error(ret)); CU_ASSERT_EQUAL_FATAL(ret ^ (1 << TSK_KAS_ERR_BIT), KAS_ERR_BAD_FILE_FORMAT); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); tsk_table_collection_free(&tables); /* open a file in the wrong mode */ f = fopen(_tmp_file_name, "w"); ret = tsk_table_collection_loadf(&tables, f, 0); ret2 = tsk_treeseq_loadf(&ts, f, 0); CU_ASSERT_EQUAL_FATAL(ret, ret2); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); CU_ASSERT_STRING_EQUAL(str, strerror(EBADF)); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); fclose(f); } static void test_load_eof(void) { tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t tables; int ret; FILE *f; f = fopen(_tmp_file_name, "w+"); CU_ASSERT_NOT_EQUAL(f, NULL); ret = tsk_table_collection_loadf(&tables, f, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); fclose(f); tsk_table_collection_free(&tables); /* Reading an empty file also returns EOF */ ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); 
tsk_table_collection_free(&tables); f = fopen(_tmp_file_name, "w+"); CU_ASSERT_NOT_EQUAL(f, NULL); ret = tsk_treeseq_dumpf(ts, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Reading from the end of the stream gives EOF */ ret = tsk_table_collection_loadf(&tables, f, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); tsk_table_collection_free(&tables); /* Reading the start of the stream is fine */ fseek(f, 0, SEEK_SET); ret = tsk_table_collection_loadf(&tables, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); /* And we should be back to the end of the stream */ ret = tsk_table_collection_loadf(&tables, f, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); tsk_table_collection_free(&tables); /* Trying to read the same end stream should give the same * result. */ ret = tsk_table_collection_loadf(&tables, f, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); tsk_table_collection_free(&tables); /* A previously init'd tables should be good too */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_loadf(&tables, f, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); tsk_table_collection_free(&tables); fclose(f); tsk_treeseq_free(ts); free(ts); } static void test_dump_errors(void) { tsk_table_collection_t tables; int ret; FILE *f; const char *str; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; ret = tsk_table_collection_dump(&tables, "/", 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); CU_ASSERT_STRING_EQUAL(str, strerror(EISDIR)); /* We're assuming that we don't have write access to /bin, so don't run this * as root! 
*/ ret = tsk_table_collection_dump(&tables, "/bin/theres_no_way_this_file_exists", 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); CU_ASSERT_TRUE( (strcmp(str, strerror(EACCES)) == 0) || (strcmp(str, strerror(EPERM)) == 0)); /* open a file in the wrong mode */ f = fopen(_tmp_file_name, "r"); ret = tsk_table_collection_dumpf(&tables, f, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); CU_ASSERT_STRING_EQUAL(str, strerror(EBADF)); fclose(f); /* We'd like to catch close errors also, but it's hard to provoke them * without intercepting calls to fclose() */ tsk_table_collection_free(&tables); } /* FIXME these are good tests, but we want to make them more general so that * they can be applied to other tables.*/ static void test_load_node_table_errors(void) { char format_name[TSK_FILE_FORMAT_NAME_LENGTH]; size_t uuid_size = 36; char uuid[uuid_size]; double L = 1; double time = 0; double flags = 0; tsk_id_t population = 0; tsk_id_t individual = 0; int8_t metadata = 0; uint32_t metadata_offset[] = { 0, 1 }; uint32_t version[2] = { TSK_FILE_FORMAT_VERSION_MAJOR, TSK_FILE_FORMAT_VERSION_MINOR }; write_table_col_t write_cols[] = { { "nodes/time", (void *) &time, 1, KAS_FLOAT64 }, { "nodes/flags", (void *) &flags, 1, TSK_FLAGS_STORAGE_TYPE }, { "nodes/population", (void *) &population, 1, TSK_ID_STORAGE_TYPE }, { "nodes/individual", (void *) &individual, 1, TSK_ID_STORAGE_TYPE }, { "nodes/metadata", (void *) &metadata, 1, KAS_UINT8 }, { "nodes/metadata_offset", (void *) metadata_offset, 2, KAS_UINT32 }, { "format/name", (void *) format_name, sizeof(format_name), KAS_INT8 }, { "format/version", (void *) version, 2, KAS_UINT32 }, { "uuid", (void *) uuid, uuid_size, KAS_INT8 }, { "sequence_length", (void *) &L, 1, KAS_FLOAT64 }, }; tsk_table_collection_t tables; kastore_t store; int ret; tsk_memcpy(format_name, TSK_FILE_FORMAT_NAME, sizeof(format_name)); /* Note: this will 
fail if we ever start parsing the form of the UUID */ tsk_memset(uuid, 0, uuid_size); ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); /* We've only defined the format headers and nodes, so we should fail immediately * after with key not found */ CU_ASSERT_FALSE(tsk_is_kas_error(ret)); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_REQUIRED_COL_NOT_FOUND); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Wrong type for time */ write_cols[0].type = KAS_INT64; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COLUMN_TYPE); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); write_cols[0].type = KAS_FLOAT64; /* Wrong length for flags */ write_cols[1].len = 0; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); write_cols[1].len = 1; /* Wrong length for metadata offset */ write_cols[5].len = 1; ret = kastore_open(&store, _tmp_file_name, "w", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); write_table_cols(&store, write_cols, sizeof(write_cols) / sizeof(*write_cols)); ret = kastore_close(&store); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); 
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_FILE_FORMAT); ret = tsk_table_collection_free(&tables); CU_ASSERT_EQUAL_FATAL(ret, 0); write_cols[5].len = 2; } static void test_example_round_trip(void) { int ret; tsk_treeseq_t *ts1 = caterpillar_tree(5, 3, 3); tsk_treeseq_t ts2; tsk_table_collection_t t1, t2; FILE *f; ret = tsk_treeseq_copy_tables(ts1, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_dump(&t1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); /* Reading multiple times into the same tables with TSK_NO_INIT is supported. */ ret = tsk_table_collection_load(&t2, _tmp_file_name, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); /* Do the same thing with treeseq API */ remove(_tmp_file_name); ret = tsk_treeseq_dump(ts1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_load(&ts2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0)); tsk_treeseq_free(&ts2); /* Use loadf form */ f = fopen(_tmp_file_name, "w+"); ret = tsk_table_collection_dumpf(&t1, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); fseek(f, 0, SEEK_SET); ret = tsk_table_collection_loadf(&t2, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); fclose(f); /* Do the same thing with treeseq API */ f = fopen(_tmp_file_name, "w+"); ret = tsk_treeseq_dumpf(ts1, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); fseek(f, 0, SEEK_SET); ret = tsk_treeseq_loadf(&ts2, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0)); tsk_treeseq_free(&ts2); fclose(f); tsk_table_collection_free(&t1); tsk_treeseq_free(ts1); free(ts1); } static void test_multiple_round_trip(void) { int ret; tsk_size_t j; 
tsk_size_t num_examples = 10; tsk_treeseq_t *ts; tsk_table_collection_t in_tables[num_examples]; tsk_table_collection_t out_tables; FILE *f = fopen(_tmp_file_name, "w+"); CU_ASSERT_NOT_EQUAL_FATAL(f, NULL); for (j = 0; j < num_examples; j++) { ts = caterpillar_tree(5 + j, 3 + j, 3 + j); ret = tsk_treeseq_copy_tables(ts, &in_tables[j], 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_dumpf(ts, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(ts); free(ts); } fseek(f, 0, SEEK_SET); for (j = 0; j < num_examples; j++) { ret = tsk_table_collection_loadf(&out_tables, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&in_tables[j], &out_tables, 0)); tsk_table_collection_free(&out_tables); } /* Can do the same with the same set of previously init'd tables. */ ret = tsk_table_collection_init(&out_tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); fseek(f, 0, SEEK_SET); for (j = 0; j < num_examples; j++) { ret = tsk_table_collection_loadf(&out_tables, f, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&in_tables[j], &out_tables, 0)); } tsk_table_collection_free(&out_tables); /* Can also read until EOF to do the same thing */ ret = tsk_table_collection_init(&out_tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); fseek(f, 0, SEEK_SET); j = 0; while (true) { ret = tsk_table_collection_loadf(&out_tables, f, TSK_NO_INIT); if (ret == TSK_ERR_EOF) { break; } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&in_tables[j], &out_tables, 0)); j++; } tsk_table_collection_free(&out_tables); CU_ASSERT_EQUAL_FATAL(j, num_examples); for (j = 0; j < num_examples; j++) { tsk_table_collection_free(&in_tables[j]); } fclose(f); } static void test_copy_store_drop_columns(void) { int ret; tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); tsk_table_collection_t t1, t2; ret = tsk_treeseq_copy_tables(ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Dropping no columns should have no effect on the data */ 
copy_store_drop_columns(ts, 0, NULL, _tmp_file_name); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t1); tsk_table_collection_free(&t2); tsk_treeseq_free(ts); free(ts); } static void test_skip_tables(void) { int ret; tsk_treeseq_t *ts1 = caterpillar_tree(5, 3, 3); tsk_treeseq_t ts2; tsk_table_collection_t t1, t2; FILE *f; ret = tsk_treeseq_dump(ts1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&t1, _tmp_file_name, TSK_LOAD_SKIP_TABLES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts1->tables, TSK_CMP_IGNORE_TABLES)); CU_ASSERT_EQUAL(t1.individuals.num_rows, 0); CU_ASSERT_EQUAL(t1.nodes.num_rows, 0); CU_ASSERT_EQUAL(t1.edges.num_rows, 0); CU_ASSERT_EQUAL(t1.migrations.num_rows, 0); CU_ASSERT_EQUAL(t1.sites.num_rows, 0); CU_ASSERT_EQUAL(t1.mutations.num_rows, 0); CU_ASSERT_EQUAL(t1.provenances.num_rows, 0); /* Test _loadf code path as well */ f = fopen(_tmp_file_name, "r+"); ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_TABLES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); fclose(f); tsk_table_collection_free(&t2); /* Without TSK_LOAD_SKIP_TABLES we reach end of file */ f = fopen(_tmp_file_name, "r+"); ret = tsk_table_collection_loadf(&t2, f, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(fgetc(f), EOF); fclose(f); tsk_table_collection_free(&t2); /* Setting TSK_LOAD_SKIP_TABLES only reads part of the file */ f = fopen(_tmp_file_name, "r+"); ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_TABLES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NOT_EQUAL(fgetc(f), EOF); fclose(f); tsk_table_collection_free(&t2); /* We should be able to make a tree sequence */ ret = tsk_treeseq_init(&ts2, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts2); /* Do the same thing with treeseq API */ ret = 
tsk_treeseq_load(&ts2, _tmp_file_name, TSK_LOAD_SKIP_TABLES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0)); tsk_treeseq_free(&ts2); f = fopen(_tmp_file_name, "r+"); ret = tsk_treeseq_loadf(&ts2, f, TSK_LOAD_SKIP_TABLES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0)); fclose(f); tsk_treeseq_free(&ts2); tsk_table_collection_free(&t1); tsk_treeseq_free(ts1); free(ts1); } static void test_skip_reference_sequence(void) { int ret; tsk_treeseq_t *ts1 = caterpillar_tree(5, 3, 3); tsk_treeseq_t ts2; tsk_table_collection_t t1, t2; FILE *f; CU_ASSERT_TRUE(tsk_treeseq_has_reference_sequence(ts1)); ret = tsk_treeseq_dump(ts1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load( &t1, _tmp_file_name, TSK_LOAD_SKIP_REFERENCE_SEQUENCE); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&t1, ts1->tables, 0)); CU_ASSERT_TRUE(tsk_table_collection_equals( &t1, ts1->tables, TSK_CMP_IGNORE_REFERENCE_SEQUENCE)); CU_ASSERT_FALSE(tsk_table_collection_has_reference_sequence(&t1)); /* Test _loadf code path as well */ f = fopen(_tmp_file_name, "r+"); ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_REFERENCE_SEQUENCE); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); fclose(f); tsk_table_collection_free(&t2); /* Setting TSK_LOAD_SKIP_REFERENCE_SEQUENCE only reads part of the file */ f = fopen(_tmp_file_name, "r+"); ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_REFERENCE_SEQUENCE); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NOT_EQUAL(fgetc(f), EOF); fclose(f); tsk_table_collection_free(&t2); /* We should be able to make a tree sequence */ ret = tsk_treeseq_init(&ts2, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts2); /* Do the same thing with treeseq API */ ret = tsk_treeseq_load(&ts2, _tmp_file_name, TSK_LOAD_SKIP_REFERENCE_SEQUENCE); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0)); tsk_treeseq_free(&ts2); f = fopen(_tmp_file_name, "r+"); ret = tsk_treeseq_loadf(&ts2, f, TSK_LOAD_SKIP_REFERENCE_SEQUENCE); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0)); fclose(f); tsk_treeseq_free(&ts2); tsk_table_collection_free(&t1); tsk_treeseq_free(ts1); free(ts1); } int main(int argc, char **argv) { CU_TestInfo tests[] = { { "test_format_data_load_errors", test_format_data_load_errors }, { "test_missing_indexes", test_missing_indexes }, { "test_malformed_indexes", test_malformed_indexes }, { "test_missing_reference_sequence", test_missing_reference_sequence }, { "test_bad_column_types", test_bad_column_types }, { "test_missing_required_columns", test_missing_required_columns }, { "test_missing_optional_column_pairs", test_missing_optional_column_pairs }, { "test_missing_required_column_pairs", test_missing_required_column_pairs }, { "test_bad_offset_columns", test_bad_offset_columns }, { "test_force_offset_64", test_force_offset_64 }, { "test_metadata_schemas_optional", test_metadata_schemas_optional }, { "test_load_node_table_errors", test_load_node_table_errors }, { "test_load_bad_file_formats", test_load_bad_file_formats }, { "test_load_errors", test_load_errors }, { "test_load_eof", test_load_eof }, { "test_dump_errors", test_dump_errors }, { "test_example_round_trip", test_example_round_trip }, { "test_multiple_round_trip", test_multiple_round_trip }, { "test_copy_store_drop_columns", test_copy_store_drop_columns }, { "test_skip_tables", test_skip_tables }, { "test_skip_reference_sequence", test_skip_reference_sequence }, { NULL, NULL }, }; return test_main(tests, argc, argv); } ================================================ FILE: c/tests/test_genotypes.c ================================================ /* * MIT License * * Copyright (c) 2019-2022 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a 
copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "testlib.h" #include #include #include #include static void test_simplest_missing_data(void) { const char *nodes = "1 0 0\n" "1 0 0\n"; const char *sites = "0.0 A\n"; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; int ret; tsk_treeseq_from_text(&ts, 1, nodes, "", NULL, sites, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 1); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_TRUE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); 
CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_simplest_missing_data_user_alleles(void) { const char *nodes = "1 0 0\n" "1 0 0\n"; const char *sites = "0.0 A\n"; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; const char *alleles[] = { "A", NULL }; int ret; tsk_id_t samples[] = { 0 }; tsk_treeseq_from_text(&ts, 1, nodes, "", NULL, sites, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 1); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_TRUE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); ret = tsk_vargen_init(&vargen, &ts, samples, 1, alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_TRUE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); 
tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_simplest_missing_data_mutations(void) { const char *nodes = "1 0 0\n" "1 0 0\n"; const char *sites = "0.0 A\n"; const char *mutations = "0 0 T -1\n"; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; const char *alleles[] = { "A", "T", NULL }; int ret; tsk_id_t samples[] = { 0 }; tsk_treeseq_from_text(&ts, 1, nodes, "", NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 1); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_TRUE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); ret = tsk_vargen_init(&vargen, &ts, samples, 1, alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_simplest_missing_data_mutations_all_samples(void) { const char *nodes = "1 0 
0\n" "1 0 0\n"; const char *sites = "0.0 A\n"; const char *mutations = "0 0 T -1\n" "0 1 T -1\n"; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; const char *alleles[] = { "A", "T", NULL }; int ret; tsk_id_t samples[] = { 0, 1 }; tsk_treeseq_from_text(&ts, 1, nodes, "", NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 2); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); ret = tsk_vargen_init(&vargen, &ts, samples, 2, alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_single_tree_user_alleles(void) { int ret = 0; const char *sites = "0.0 G\n" "0.125 A\n" "0.25 C\n" "0.5 A\n"; const char *mutations = "0 0 T -1\n" "1 1 C -1\n" "2 0 G 
-1\n" "2 1 A -1\n" "2 2 T -1\n" // A bunch of different sample mutations "3 4 T -1\n" "3 0 A 5\n"; // A back mutation from T -> A tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; const char *alleles[] = { "A", "C", "G", "T", NULL }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_EQUAL_FATAL(var->num_alleles, 4); CU_ASSERT_EQUAL(var->allele_lengths[0], 1); CU_ASSERT_EQUAL(var->allele_lengths[1], 1); CU_ASSERT_EQUAL(var->allele_lengths[2], 1); CU_ASSERT_EQUAL(var->allele_lengths[3], 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "C", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 3); CU_ASSERT_EQUAL(var->genotypes[1], 2); CU_ASSERT_EQUAL(var->genotypes[2], 2); CU_ASSERT_EQUAL(var->genotypes[3], 2); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.125); CU_ASSERT_EQUAL(var->num_alleles, 4); CU_ASSERT_EQUAL(var->allele_lengths[0], 1); CU_ASSERT_EQUAL(var->allele_lengths[1], 1); CU_ASSERT_EQUAL(var->allele_lengths[2], 1); CU_ASSERT_EQUAL(var->allele_lengths[3], 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "C", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); 
CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.25); CU_ASSERT_EQUAL(var->num_alleles, 4); CU_ASSERT_EQUAL(var->allele_lengths[0], 1); CU_ASSERT_EQUAL(var->allele_lengths[1], 1); CU_ASSERT_EQUAL(var->allele_lengths[2], 1); CU_ASSERT_EQUAL(var->allele_lengths[3], 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "C", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 2); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->genotypes[2], 3); CU_ASSERT_EQUAL(var->genotypes[3], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.5); CU_ASSERT_EQUAL(var->num_alleles, 4); CU_ASSERT_EQUAL(var->allele_lengths[0], 1); CU_ASSERT_EQUAL(var->allele_lengths[1], 1); CU_ASSERT_EQUAL(var->allele_lengths[2], 1); CU_ASSERT_EQUAL(var->allele_lengths[3], 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "C", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 3); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_single_tree_char_alphabet(void) { int ret = 0; const char *sites = "0.0 A\n" "0.125 A\n" "0.25 C\n" "0.5 A\n"; const char *mutations = "0 0 T -1\n" "1 1 TTTAAGGG -1\n" "2 0 G -1\n" "2 1 AT -1\n" "2 2 T -1\n" // A bunch of different sample mutations "3 4 T -1\n" "3 0 A 5\n"; // A back mutation from T -> A tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, sites, 
mutations, NULL, NULL, 0); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_EQUAL(var->allele_lengths[0], 1); CU_ASSERT_EQUAL(var->allele_lengths[1], 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.125); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_EQUAL(var->allele_lengths[0], 1); CU_ASSERT_EQUAL(var->allele_lengths[1], 8); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "TTTAAGGG", 8); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->site.position, 0.25); CU_ASSERT_EQUAL(var->num_alleles, 4); CU_ASSERT_EQUAL(var->allele_lengths[0], 1); CU_ASSERT_EQUAL(var->allele_lengths[1], 1); CU_ASSERT_EQUAL(var->allele_lengths[2], 2); CU_ASSERT_EQUAL(var->allele_lengths[3], 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "C", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "G", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[2], "AT", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[3], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 2); CU_ASSERT_EQUAL(var->genotypes[2], 3); CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); 
CU_ASSERT_EQUAL(var->site.position, 0.5); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_EQUAL(var->allele_lengths[0], 1); CU_ASSERT_EQUAL(var->allele_lengths[1], 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "A", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "T", 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->genotypes[3], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_single_tree_binary_alphabet(void) { int ret = 0; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->genotypes[2], 1); CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 0); CU_ASSERT_EQUAL(var->site.mutations_length, 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 1); CU_ASSERT_EQUAL(var->site.mutations_length, 2); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 1); 
CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->genotypes[2], 1); CU_ASSERT_EQUAL(var->genotypes[3], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 2); CU_ASSERT_EQUAL(var->site.mutations_length, 4); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_free(&vargen); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } static void test_single_tree_non_samples(void) { int ret = 0; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; /* Non sample internal nodes we want to generate genotypes for */ tsk_id_t samples[] = { 4, 5 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 0); CU_ASSERT_EQUAL(var->site.mutations_length, 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 1); CU_ASSERT_EQUAL(var->site.mutations_length, 2); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], 
"1", 1); CU_ASSERT_EQUAL(var->site.id, 2); CU_ASSERT_EQUAL(var->site.mutations_length, 4); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_free(&vargen); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 0); CU_ASSERT_EQUAL(var->site.mutations_length, 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 1); CU_ASSERT_EQUAL(var->site.mutations_length, 2); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 2); CU_ASSERT_EQUAL(var->site.mutations_length, 4); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_free(&vargen); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } static void test_isolated_internal_node(void) { int ret = 0; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; /* Two sample nodes (0,1), plus an internal non-sample node u=2 with no edges */ const char *nodes = "1 0 -1 -1\n" "1 0 -1 -1\n" "0 1 -1 -1\n"; const char *sites = "2.0 A\n" "9.0 T\n"; tsk_id_t samples[] = { 2 }; tsk_treeseq_from_text(&ts, 10, 
nodes, "", NULL, sites, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_sites(&ts), 2); /* Default options (isolated_as_missing=True): internal node is isolated everywhere */ ret = tsk_vargen_init(&vargen, &ts, samples, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_TRUE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_TRUE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); /* Impute missing (isolated_as_missing=False): genotypes should be ancestral (0) */ ret = tsk_vargen_init(&vargen, &ts, samples, 1, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_FALSE(var->has_missing_data); CU_ASSERT_EQUAL(var->genotypes[0], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_single_tree_errors(void) { int ret; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_id_t samples[] = { 0, 3 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); samples[0] = -1; ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_vargen_free(&vargen); samples[0] = 7; ret = 
tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_vargen_free(&vargen); samples[0] = 3; ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_single_tree_user_alleles_errors(void) { int ret; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; /* The maximium number of alleles is 127. We need space for one more plus the * sentinel */ const char *acct_alleles[] = { "A", "C", "G", "T", NULL }; const char *zero_allele[] = { "0", NULL }; const char *no_alleles[] = { NULL }; tsk_id_t samples[] = { 0, 3 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); /* these are 0/1 alleles */ ret = tsk_vargen_init(&vargen, &ts, samples, 2, acct_alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ALLELE_NOT_FOUND); tsk_vargen_free(&vargen); /* pass just the 0 allele alleles at all */ ret = tsk_vargen_init(&vargen, &ts, samples, 2, zero_allele, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ALLELE_NOT_FOUND); tsk_vargen_free(&vargen); /* Empty allele list is an error */ ret = tsk_vargen_init(&vargen, &ts, samples, 2, no_alleles, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ZERO_ALLELES); tsk_vargen_free(&vargen); // for (j = 0; j < max_alleles; j++) { // many_alleles[j] = "0"; // } // many_alleles[128] = NULL; // ret = tsk_vargen_init(&vargen, &ts, samples, 2, many_alleles, 0); // CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_ALLELES); // tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_single_tree_subsample(void) { int ret = 0; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; tsk_id_t samples[] = { 0, 3 }; tsk_treeseq_from_text(&ts, 1, 
single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 0); CU_ASSERT_EQUAL(var->site.mutations_length, 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 1); CU_ASSERT_EQUAL(var->site.mutations_length, 2); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 2); CU_ASSERT_EQUAL(var->site.mutations_length, 4); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_free(&vargen); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Zero samples */ ret = tsk_vargen_init(&vargen, &ts, samples, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 0); CU_ASSERT_EQUAL(var->site.mutations_length, 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); 
CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 1); CU_ASSERT_EQUAL(var->site.mutations_length, 2); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 2); CU_ASSERT_EQUAL(var->site.mutations_length, 4); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_free(&vargen); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } static void test_single_tree_many_alleles(void) { int ret = 0; tsk_id_t ret_id; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; tsk_size_t num_alleles = 257; tsk_id_t j, k; char alleles[num_alleles]; tsk_table_collection_t tables; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_FATAL(ret == 0); tsk_treeseq_free(&ts); tsk_memset(alleles, 'X', (size_t) num_alleles); ret_id = tsk_site_table_add_row(&tables.sites, 0, "Y", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); /* Add j mutations over a single node. */ for (j = 0; j < (tsk_id_t) num_alleles; j++) { /* When j = 0 we get a parent of -1, which is the NULL_NODE */ ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, j - 1, TSK_UNKNOWN_TIME, alleles, (tsk_size_t) j, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); /* We have j + 2 alleles. 
So, if j >= 126, we should fail with 8bit * genotypes */ // if (j >= 126) { // CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_ALLELES); // } else { CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "Y", 1); for (k = 1; k < (tsk_id_t) var->num_alleles; k++) { CU_ASSERT_EQUAL(k - 1, (tsk_id_t) var->allele_lengths[k]); CU_ASSERT_NSTRING_EQUAL(var->alleles[k], alleles, var->allele_lengths[k]); } CU_ASSERT_EQUAL(var->num_alleles, (tsk_size_t) j + 2); // } ret = tsk_vargen_free(&vargen); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } tsk_table_collection_free(&tables); } static void test_single_tree_silent_mutations(void) { int ret = 0; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; /* Add some silent mutations */ const char *silent_ex_sites = "0.125 0\n" "0.25 0\n" "0.5 0\n" "0.75 0\n"; /* site, node, derived_state, [parent, time] */ const char *silent_ex_mutations = "0 5 0 -1\n" /* Silent mutation over mutation 1 */ "0 2 1 0\n" "1 4 1 -1\n" "1 0 0 2\n" /* Back mutation over 0 */ "1 0 0 3\n" /* Silent mutation under back mutation */ "2 0 1 -1\n" /* recurrent mutations over samples */ "2 1 1 -1\n" "2 2 1 -1\n" "2 3 1 -1\n" "3 0 0 -1\n" /* Single silent mutation at a site */ ; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, silent_ex_sites, silent_ex_mutations, NULL, NULL, 0); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->genotypes[2], 1); CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 0); CU_ASSERT_EQUAL(var->site.mutations_length, 2); ret = tsk_vargen_next(&vargen, &var); 
CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 1); CU_ASSERT_EQUAL(var->site.mutations_length, 3); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->genotypes[2], 1); CU_ASSERT_EQUAL(var->genotypes[3], 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 2); CU_ASSERT_EQUAL(var->site.mutations_length, 4); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->genotypes[3], 0); CU_ASSERT_EQUAL(var->num_alleles, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->site.id, 3); CU_ASSERT_EQUAL(var->site.mutations_length, 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_free(&vargen); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } static void test_multiple_variant_decode(void) { int ret = 0; tsk_size_t k; tsk_id_t s; tsk_treeseq_t ts; tsk_variant_t var; tsk_variant_t var_subset; tsk_id_t samples[] = { 0, 1, 3 }; int32_t genos[12]; int32_t genos_expected[] = { 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1 }; int32_t genos_subset[9]; int32_t genos_expected_subset[] = { 0, 0, 0, 1, 0, 0, 0, 1, 1 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); /* Sample subset, no sample lists */ ret = 
tsk_variant_init(&var_subset, &ts, samples, 3, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (s = 0; (tsk_size_t) s < tsk_treeseq_get_num_sites(&ts); s++) { ret = tsk_variant_decode(&var_subset, s, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (k = 0; k < 3; ++k) { genos_subset[k + ((tsk_size_t) s * 3)] = var_subset.genotypes[k]; } } CU_ASSERT_EQUAL( 0, memcmp(genos_subset, genos_expected_subset, sizeof(genos_expected_subset))); memset(genos_subset, 0, sizeof(genos_subset)); /* All samples with TSK_SAMPLE_LISTS, at the same time as a subset */ s = 0; ret = tsk_variant_init(&var, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (s = 0; (tsk_size_t) s < tsk_treeseq_get_num_sites(&ts); s++) { ret = tsk_variant_decode(&var, s, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (k = 0; k < 4; ++k) { genos[k + ((tsk_size_t) s * 4)] = var.genotypes[k]; } ret = tsk_variant_decode(&var_subset, s, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (k = 0; k < 3; ++k) { genos_subset[k + ((tsk_size_t) s * 3)] = var_subset.genotypes[k]; } } CU_ASSERT_EQUAL( 0, memcmp(genos_subset, genos_expected_subset, sizeof(genos_expected_subset))); CU_ASSERT_EQUAL(0, memcmp(genos, genos_expected, sizeof(genos_expected))); tsk_variant_free(&var); tsk_variant_free(&var_subset); tsk_treeseq_free(&ts); } static void test_variant_decode_errors(void) { int ret = 0; tsk_treeseq_t ts; tsk_variant_t var; tsk_id_t bad_samples[] = { 0, 1, 32 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); /* Bad samples */ ret = tsk_variant_init(&var, &ts, bad_samples, 3, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_variant_free(&var); /* Site out of bounds */ ret = tsk_variant_init(&var, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_variant_decode(&var, 42, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); tsk_variant_free(&var); tsk_treeseq_free(&ts); } /* Checks that the data represented by 
the specified pair of variants exposed * by the public API is equal. */ static void assert_variants_equal(const tsk_variant_t *v1, const tsk_variant_t *v2) { tsk_size_t j; CU_ASSERT_EQUAL(v1->num_samples, v2->num_samples); CU_ASSERT_EQUAL(v1->num_alleles, v2->num_alleles); for (j = 0; j < v1->num_alleles; j++) { CU_ASSERT_EQUAL(v1->allele_lengths[j], v2->allele_lengths[j]); CU_ASSERT_EQUAL( 0, memcmp(v1->alleles[j], v2->alleles[j], (size_t) v1->allele_lengths[j])); } CU_ASSERT_EQUAL(v1->has_missing_data, v2->has_missing_data); CU_ASSERT_EQUAL(v1->num_samples, v2->num_samples); for (j = 0; j < v1->num_samples; j++) { CU_ASSERT_EQUAL(v1->samples[j], v2->samples[j]); CU_ASSERT_EQUAL(v1->genotypes[j], v2->genotypes[j]); } CU_ASSERT_EQUAL(v1->site.id, v2->site.id); CU_ASSERT_EQUAL(v1->site.position, v2->site.position); CU_ASSERT_EQUAL(v1->site.ancestral_state_length, v2->site.ancestral_state_length); CU_ASSERT_EQUAL(0, memcmp(v1->site.ancestral_state, v2->site.ancestral_state, (size_t) v1->site.ancestral_state_length)); CU_ASSERT_EQUAL(v1->site.mutations_length, v2->site.mutations_length); /* We're pointing back to the same memory for embedded pointers */ CU_ASSERT_EQUAL(v1->site.mutations, v2->site.mutations); CU_ASSERT_EQUAL(v1->site.metadata, v2->site.metadata); } static void test_variant_copy(void) { int ret = 0; tsk_size_t j; tsk_treeseq_t ts; tsk_variant_t var, var_copy; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_variant_init(&var, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < tsk_treeseq_get_num_sites(&ts); j++) { ret = tsk_variant_decode(&var, (tsk_id_t) j, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_variant_restricted_copy(&var, &var_copy); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_variant_decode(&var_copy, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_VARIANT_CANT_DECODE_COPY); assert_variants_equal(&var, &var_copy); CU_ASSERT_EQUAL( 0, 
memcmp(var.tree_sequence, var.tree_sequence, sizeof(*var.tree_sequence))); CU_ASSERT_EQUAL(0, memcmp(&var.tree, &var_copy.tree, sizeof(tsk_tree_t))); CU_ASSERT_EQUAL(0, memcmp(&var.site, &var_copy.site, sizeof(tsk_site_t))); CU_ASSERT_EQUAL(var_copy.traversal_stack, NULL); CU_ASSERT_EQUAL(var_copy.sample_index_map, NULL); CU_ASSERT_EQUAL(var_copy.alt_samples, NULL); CU_ASSERT_EQUAL(var_copy.alt_sample_index_map, NULL); tsk_variant_free(&var_copy); } tsk_variant_free(&var); tsk_treeseq_free(&ts); }

/* Checks tsk_variant_restricted_copy on sites whose ancestral and derived
 * states are several characters long: for each site the decoded variant, a
 * copy of it, and a copy of that copy must all compare equal.
 */
static void
test_variant_copy_long_alleles(void)
{
    int ret = 0;
    const char *sites = "0.0    GGGG\n"
                        "0.125  AAAAA\n"
                        "0.25   CCCCCC\n"
                        "0.5    AAAAAAA\n";
    const char *mutations = "0    0     TTT    -1\n"
                            "1    1     CCCCCCC    -1\n"
                            "2    0     GGGGGGG    -1\n"
                            "2    1     AG    -1\n"
                            "2    2     TTTTTTT    -1\n"
                            "3    4     TGGGGGG    -1\n"
                            "3    0     AAA    5\n";
    tsk_treeseq_t ts;
    tsk_variant_t var, copy, copy_of_copy;
    tsk_size_t j;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        sites, mutations, NULL, NULL, 0);

    ret = tsk_variant_init(&var, &ts, NULL, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    for (j = 0; j < tsk_treeseq_get_num_sites(&ts); j++) {
        ret = tsk_variant_decode(&var, (tsk_id_t) j, 0);
        CU_ASSERT_EQUAL_FATAL(ret, 0);

        /* First-generation copy of the decoded variant. */
        ret = tsk_variant_restricted_copy(&var, &copy);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        assert_variants_equal(&var, &copy);

        /* Copying a copy must give the same public data again. */
        ret = tsk_variant_restricted_copy(&copy, &copy_of_copy);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        assert_variants_equal(&var, &copy_of_copy);

        tsk_variant_free(&copy_of_copy);
        tsk_variant_free(&copy);
    }

    tsk_variant_free(&var);
    tsk_treeseq_free(&ts);
}

static void test_variant_copy_memory_management(void) { int ret = 0; tsk_size_t j; tsk_treeseq_t ts; tsk_variant_t *var; tsk_variant_t copy, copy_of_copy; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); for (j = 0; j < tsk_treeseq_get_num_sites(&ts); j++) { var = tsk_malloc(sizeof(*var));
CU_ASSERT_FATAL(var != NULL); ret = tsk_variant_init(var, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_variant_decode(var, (tsk_id_t) j, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_variant_restricted_copy(var, ©); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_variants_equal(var, ©); /* Free var to make sure we're not pointing to any of the original memory. */ tsk_variant_free(var); free(var); ret = tsk_variant_restricted_copy(©, ©_of_copy); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_variants_equal(©, ©_of_copy); ret = tsk_variant_decode(©, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_VARIANT_CANT_DECODE_COPY); ret = tsk_variant_decode(©_of_copy, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_VARIANT_CANT_DECODE_COPY); tsk_variant_free(©); tsk_variant_free(©_of_copy); } tsk_treeseq_free(&ts); } static void build_balanced_three_example_align(tsk_treeseq_t *ts) { const char *nodes = "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "0 1 0 -1\n" "0 2 0 -1\n"; const char *edges = "0 10 3 1,2\n" "0 10 4 0,3\n"; const char *sites = "2 A\n" "9 T\n"; const char *mutations = "0 0 G\n" "1 3 C\n"; tsk_treeseq_from_text(ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); } static void test_alignments_basic_default(void) { int ret = 0; tsk_treeseq_t ts; const char *ref = "NNNNNNNNNN"; const tsk_id_t *samples; tsk_size_t n, L; char *buf; build_balanced_three_example_align(&ts); samples = tsk_treeseq_get_samples(&ts); n = tsk_treeseq_get_num_samples(&ts); L = 10; buf = tsk_malloc(n * L); CU_ASSERT_PTR_NOT_NULL_FATAL(buf); ret = tsk_treeseq_decode_alignments( &ts, ref, (tsk_size_t) strlen(ref), samples, n, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf + 0 * L, "NNGNNNNNNT", L); CU_ASSERT_NSTRING_EQUAL(buf + 1 * L, "NNANNNNNNC", L); CU_ASSERT_NSTRING_EQUAL(buf + 2 * L, "NNANNNNNNC", L); tsk_safe_free(buf); tsk_treeseq_free(&ts); } static void test_alignments_reference_sequence(void) { int ret = 0; tsk_treeseq_t ts; const char *ref = "0123456789"; 
const tsk_id_t *samples; tsk_size_t n, L = 10; char *buf = NULL; build_balanced_three_example_align(&ts); samples = tsk_treeseq_get_samples(&ts); n = tsk_treeseq_get_num_samples(&ts); buf = tsk_malloc(n * L); CU_ASSERT_PTR_NOT_NULL_FATAL(buf); ret = tsk_treeseq_decode_alignments( &ts, ref, (tsk_size_t) strlen(ref), samples, n, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf + 0 * L, "01G345678T", L); CU_ASSERT_NSTRING_EQUAL(buf + 1 * L, "01A345678C", L); CU_ASSERT_NSTRING_EQUAL(buf + 2 * L, "01A345678C", L); tsk_safe_free(buf); tsk_treeseq_free(&ts); } static void test_alignments_partial_isolation(void) { int ret = 0; const char *nodes = "0 1 0 -1\n" /* parent */ "1 0 0 -1\n"; /* child sample */ const char *edges = "3 7 0 1\n"; const char *sites = "5 A\n"; const char *mutations = "0 1 G\n"; tsk_treeseq_t ts; const char *ref = "0123456789"; tsk_id_t node = 1; char buf[10]; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, &node, 1, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf, "NNN34G6NNN", 10); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, &node, 1, 2, 8, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf, "N34G6N", 6); tsk_treeseq_free(&ts); } static void test_alignments_return_code_truncated_interval(void) { int ret = 0; const char *nodes = "1 0 0 -1\n" "1 0 0 -1\n" "0 1 0 -1\n"; /* Tree over [0,5): samples 0 and 1 under root 2. * Tree over [5,10): only sample 1 under root 2 (sample 0 isolated). 
*/ const char *edges = "0 5 2 0\n" "0 10 2 1\n"; tsk_treeseq_t ts; const tsk_id_t *samples; tsk_size_t n; char buf[10]; const char *ref = "NNNNNNNNNN"; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); samples = tsk_treeseq_get_samples(&ts); n = tsk_treeseq_get_num_samples(&ts); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, samples, n, 0, 5, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf + 0 * 5, "NNNNN", 5); CU_ASSERT_NSTRING_EQUAL(buf + 1 * 5, "NNNNN", 5); tsk_treeseq_free(&ts); } static void test_alignments_invalid_allele_length(void) { int ret = 0; const char *nodes = "1 0 0 -1\n"; const char *edges = ""; const char *sites = "2 AC\n"; tsk_treeseq_t ts; tsk_id_t node = 0; char buf[5]; const char *ref = "NNNNN"; tsk_treeseq_from_text(&ts, 5, nodes, edges, NULL, sites, NULL, NULL, NULL, 0); ret = tsk_treeseq_decode_alignments(&ts, ref, 5, &node, 1, 0, 5, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_ALLELE_LENGTH); tsk_treeseq_free(&ts); } static void test_alignments_bad_reference_length(void) { int ret = 0; const char *nodes = "1 0 0 -1\n"; const char *edges = ""; tsk_treeseq_t ts; tsk_id_t node = 0; char buf[5]; const char *ref = "NNNNN"; tsk_treeseq_from_text(&ts, 5, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_decode_alignments(&ts, ref, 4, &node, 1, 0, 5, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_treeseq_free(&ts); } static void test_alignments_non_integer_bounds(void) { int ret = 0; const char *nodes = "1 0 0 -1\n"; const char *edges = ""; tsk_treeseq_t ts; tsk_id_t node = 0; char buf[5]; const char *ref = "NNNNN"; tsk_treeseq_from_text(&ts, 5, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_decode_alignments(&ts, ref, 5, &node, 1, 0.5, 5, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_treeseq_free(&ts); } static void test_alignments_discrete_genome_required(void) { int ret = 0; const char *nodes = "1 0 
0 -1\n"; const char *edges = ""; const char *sites = "0.5 A\n"; tsk_treeseq_t ts; tsk_id_t node = 0; char buf[5]; const char *ref = "NNNNN"; tsk_treeseq_from_text(&ts, 5, nodes, edges, NULL, sites, NULL, NULL, NULL, 0); ret = tsk_treeseq_decode_alignments(&ts, ref, 5, &node, 1, 0, 5, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_treeseq_free(&ts); } static void test_alignments_null_reference(void) { int ret = 0; tsk_treeseq_t ts; const tsk_id_t *samples; tsk_size_t n; char buf[10]; build_balanced_three_example_align(&ts); samples = tsk_treeseq_get_samples(&ts); n = tsk_treeseq_get_num_samples(&ts); ret = tsk_treeseq_decode_alignments(&ts, NULL, 10, samples, n, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_treeseq_free(&ts); } static void test_alignments_null_nodes_or_buf(void) { int ret = 0; tsk_treeseq_t ts; const char *ref = "NNNNNNNNNN"; const tsk_id_t *samples; tsk_size_t n; char buf[30]; build_balanced_three_example_align(&ts); samples = tsk_treeseq_get_samples(&ts); n = tsk_treeseq_get_num_samples(&ts); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, NULL, n, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, samples, n, 0, 10, 'N', NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_treeseq_free(&ts); } static void test_alignments_node_out_of_bounds(void) { int ret = 0; tsk_treeseq_t ts; const char *ref = "NNNNNNNNNN"; tsk_id_t bad_node; char buf[10]; build_balanced_three_example_align(&ts); bad_node = (tsk_id_t) tsk_treeseq_get_num_nodes(&ts); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, &bad_node, 1, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); } static void test_alignments_isolated_as_not_missing(void) { int ret = 0; const char *nodes = "0 1 0 -1\n" /* parent */ "1 0 0 -1\n"; /* child sample */ const char *edges = "3 7 0 1\n"; const char *sites = "5 
A\n"; const char *mutations = "0 1 G\n"; tsk_treeseq_t ts; const char *ref = "0123456789"; tsk_id_t node = 1; char buf[10]; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_decode_alignments( &ts, ref, 10, &node, 1, 0, 10, 'N', buf, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf, "01234G6789", 10); ret = tsk_treeseq_decode_alignments( &ts, ref, 10, &node, 1, 2, 8, 'N', buf, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf, "234G67", 6); tsk_treeseq_free(&ts); } static void test_alignments_internal_node_non_sample(void) { int ret = 0; tsk_treeseq_t ts; const char *ref = "NNNNNNNNNN"; tsk_id_t node = 3; /* internal node */ char buf[10]; build_balanced_three_example_align(&ts); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, &node, 1, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf, "NNANNNNNNC", 10); tsk_treeseq_free(&ts); } static void test_alignments_missing_char_collision(void) { int ret = 0; const char *nodes = "1 0 0 -1\n"; const char *edges = ""; const char *sites = "2 A\n"; const char *mutations = "0 0 Q\n"; /* allele equals missing char */ tsk_treeseq_t ts; tsk_id_t node = 0; char buf[5]; const char *ref = "NNNNN"; tsk_treeseq_from_text(&ts, 5, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_decode_alignments(&ts, ref, 5, &node, 1, 0, 5, 'Q', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MISSING_CHAR_COLLISION); tsk_treeseq_free(&ts); } static void test_alignments_zero_nodes_ok(void) { int ret = 0; tsk_treeseq_t ts; const char *ref = "NNNNNNNNNN"; build_balanced_three_example_align(&ts); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, NULL, 0, 0, 10, 'N', NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } static void test_alignments_bad_bounds_cases(void) { int ret = 0; tsk_treeseq_t ts; const char *ref = "NNNNNNNNNN"; tsk_id_t node = 0; char buf[1]; 
build_balanced_three_example_align(&ts); /* left == right invalid */ ret = tsk_treeseq_decode_alignments(&ts, ref, 10, &node, 1, 5, 5, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* left negative */ ret = tsk_treeseq_decode_alignments(&ts, ref, 10, &node, 1, -1, 5, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_treeseq_free(&ts); } static void test_alignments_order_preserved(void) { int ret = 0; tsk_treeseq_t ts; const char *ref = "NNNNNNNNNN"; tsk_id_t nodes_arr[3]; char buf[30]; tsk_size_t L = 10; build_balanced_three_example_align(&ts); nodes_arr[0] = 2; nodes_arr[1] = 0; nodes_arr[2] = 1; ret = tsk_treeseq_decode_alignments(&ts, ref, 10, nodes_arr, 3, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf + 0 * L, "NNANNNNNNC", L); CU_ASSERT_NSTRING_EQUAL(buf + 1 * L, "NNGNNNNNNT", L); CU_ASSERT_NSTRING_EQUAL(buf + 2 * L, "NNANNNNNNC", L); tsk_treeseq_free(&ts); } static void test_alignments_missing_char_custom(void) { int ret = 0; const char *nodes = "0 1 0 -1\n" /* parent */ "1 0 0 -1\n"; /* child sample */ const char *edges = "3 7 0 1\n"; const char *sites = "5 A\n"; const char *mutations = "0 1 G\n"; tsk_treeseq_t ts; const char *ref = "0123456789"; tsk_id_t node = 1; char buf[10]; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, &node, 1, 0, 10, 'Q', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_NSTRING_EQUAL(buf, "QQQ34G6QQQ", 10); tsk_treeseq_free(&ts); } static void test_alignments_embedded_null_reference(void) { int ret = 0; tsk_treeseq_t ts; char ref[10] = { '0', '1', '2', '3', '\0', '5', '6', '7', '8', '9' }; const tsk_id_t *samples; tsk_size_t n, L = 10; char *buf = NULL; char exp0[10] = { '0', '1', 'G', '3', '\0', '5', '6', '7', '8', 'T' }; char exp1[10] = { '0', '1', 'A', '3', '\0', '5', '6', '7', '8', 'C' }; char exp2[10] = { '0', '1', 'A', '3', '\0', '5', '6', '7', '8', 'C' }; 
build_balanced_three_example_align(&ts); samples = tsk_treeseq_get_samples(&ts); n = tsk_treeseq_get_num_samples(&ts); buf = tsk_malloc(n * L); CU_ASSERT_PTR_NOT_NULL_FATAL(buf); ret = tsk_treeseq_decode_alignments(&ts, ref, 10, samples, n, 0, 10, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(0, memcmp(buf + 0 * L, exp0, (size_t) L)); CU_ASSERT_EQUAL(0, memcmp(buf + 1 * L, exp1, (size_t) L)); CU_ASSERT_EQUAL(0, memcmp(buf + 2 * L, exp2, (size_t) L)); tsk_safe_free(buf); tsk_treeseq_free(&ts); } static void test_alignments_growing_allele_buffer(void) { /* Verify we handle sites with increasing allele counts without per-site realloc * churn. */ int ret = 0; /* Two samples (0,1) with root 2 over [0,3). */ const char *nodes = "1 0 0 -1\n" "1 0 0 -1\n" "0 1 0 -1\n"; const char *edges = "0 3 2 0\n" "0 3 2 1\n"; /* Sites: pos 1 ancestral A; pos 2 ancestral A. */ const char *sites = "1 A\n" "2 A\n"; /* Mutations: at site 0 (pos 1) node 0 -> G (2 alleles total). * at site 1 (pos 2) node 0 -> C and node 1 -> T (3 alleles total). 
*/ const char *mutations = "0 0 G\n" "1 0 C\n" "1 1 T\n"; tsk_treeseq_t ts; const char *ref = "NNN"; const tsk_id_t *samples; tsk_size_t n, L = 3; char *buf = NULL; tsk_treeseq_from_text(&ts, 3, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); samples = tsk_treeseq_get_samples(&ts); n = tsk_treeseq_get_num_samples(&ts); buf = tsk_malloc(n * L); CU_ASSERT_PTR_NOT_NULL_FATAL(buf); ret = tsk_treeseq_decode_alignments(&ts, ref, 3, samples, n, 0, 3, 'N', buf, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Expected: sample 0 -> NGC; sample 1 -> NAT */ CU_ASSERT_NSTRING_EQUAL(buf + 0 * L, "NGC", L); CU_ASSERT_NSTRING_EQUAL(buf + 1 * L, "NAT", L); tsk_safe_free(buf); tsk_treeseq_free(&ts); } int main(int argc, char **argv) { CU_TestInfo tests[] = { { "test_simplest_missing_data", test_simplest_missing_data }, { "test_simplest_missing_data_user_alleles", test_simplest_missing_data_user_alleles }, { "test_simplest_missing_data_mutations", test_simplest_missing_data_mutations }, { "test_simplest_missing_data_mutations_all_samples", test_simplest_missing_data_mutations_all_samples }, { "test_single_tree_user_alleles", test_single_tree_user_alleles }, { "test_single_tree_char_alphabet", test_single_tree_char_alphabet }, { "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet }, { "test_single_tree_non_samples", test_single_tree_non_samples }, { "test_isolated_internal_node", test_isolated_internal_node }, { "test_single_tree_errors", test_single_tree_errors }, { "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors }, { "test_single_tree_subsample", test_single_tree_subsample }, { "test_single_tree_many_alleles", test_single_tree_many_alleles }, { "test_single_tree_silent_mutations", test_single_tree_silent_mutations }, { "test_multiple_variant_decode", test_multiple_variant_decode }, { "test_variant_decode_errors", test_variant_decode_errors }, { "test_variant_copy", test_variant_copy }, { "test_variant_copy_long_alleles", 
test_variant_copy_long_alleles }, { "test_variant_copy_memory_management", test_variant_copy_memory_management }, { "test_alignments_basic_default", test_alignments_basic_default }, { "test_alignments_reference_sequence", test_alignments_reference_sequence }, { "test_alignments_partial_isolation", test_alignments_partial_isolation }, { "test_alignments_return_code_truncated_interval", test_alignments_return_code_truncated_interval }, { "test_alignments_isolated_as_not_missing", test_alignments_isolated_as_not_missing }, { "test_alignments_internal_node_non_sample", test_alignments_internal_node_non_sample }, { "test_alignments_invalid_allele_length", test_alignments_invalid_allele_length }, { "test_alignments_bad_reference_length", test_alignments_bad_reference_length }, { "test_alignments_non_integer_bounds", test_alignments_non_integer_bounds }, { "test_alignments_discrete_genome_required", test_alignments_discrete_genome_required }, { "test_alignments_null_reference", test_alignments_null_reference }, { "test_alignments_null_nodes_or_buf", test_alignments_null_nodes_or_buf }, { "test_alignments_node_out_of_bounds", test_alignments_node_out_of_bounds }, { "test_alignments_missing_char_collision", test_alignments_missing_char_collision }, { "test_alignments_zero_nodes_ok", test_alignments_zero_nodes_ok }, { "test_alignments_bad_bounds_cases", test_alignments_bad_bounds_cases }, { "test_alignments_order_preserved", test_alignments_order_preserved }, { "test_alignments_missing_char_custom", test_alignments_missing_char_custom }, { "test_alignments_embedded_null_reference", test_alignments_embedded_null_reference }, { "test_alignments_growing_allele_buffer", test_alignments_growing_allele_buffer }, { NULL, NULL }, }; return test_main(tests, argc, argv); } ================================================ FILE: c/tests/test_haplotype_matching.c ================================================ /* * MIT License * * Copyright (c) 2019-2023 Tskit Developers * * Permission 
is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

#include "testlib.h"
#include
#include
#include

/* With the HMM initialised using TSK_ALLELES_ACGT, both the forward and the
 * Viterbi runs on the all-zero haplotype are expected to return
 * TSK_ERR_ALLELE_NOT_FOUND.
 * NOTE(review): h has four entries while rho and mu have three. */
static void
test_single_tree_missing_alleles(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_ls_hmm_t ls_hmm;
    tsk_compressed_matrix_t forward;
    tsk_viterbi_matrix_t viterbi;

    double rho[] = { 0, 0.25, 0.25 };
    double mu[] = { 0.125, 0.125, 0.125 };
    int32_t h[] = { 0, 0, 0, 0 };

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);

    ret = tsk_ls_hmm_init(&ls_hmm, &ts, rho, mu, TSK_ALLELES_ACGT);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_ls_hmm_forward(&ls_hmm, h, &forward, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ALLELE_NOT_FOUND);

    ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ALLELE_NOT_FOUND);

    tsk_ls_hmm_free(&ls_hmm);
    tsk_compressed_matrix_free(&forward);
    tsk_viterbi_matrix_free(&viterbi);
    tsk_treeseq_free(&ts);
}

/* Runs the forward and Viterbi algorithms on the single tree example with
 * zero mutation rates and checks that the Viterbi traceback is { 2, 1, 1 },
 * first with the default settings and then for every precision in 1..23. */
static void
test_single_tree_exact_match(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_ls_hmm_t ls_hmm;
    tsk_compressed_matrix_t forward;
    tsk_viterbi_matrix_t viterbi;

    double rho[] = { 0.0, 0.25, 0.25 };
    double mu[] = { 0, 0, 0 };
    int32_t h[] = { 1, 1, 1 };
    tsk_id_t path[3];
    double decoded_compressed_matrix[12];
    unsigned int precision;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);

    ret = tsk_ls_hmm_init(&ls_hmm, &ts, rho, mu, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_ls_hmm_forward(&ls_hmm, h, &forward, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* The print_state calls write to _devnull; they only exercise the code. */
    tsk_compressed_matrix_print_state(&forward, _devnull);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);
    ret = tsk_compressed_matrix_decode(&forward, decoded_compressed_matrix);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_viterbi_matrix_print_state(&viterbi, _devnull);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);
    ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(path[0], 2);
    CU_ASSERT_EQUAL(path[1], 1);
    CU_ASSERT_EQUAL(path[2], 1);

    /* Should get the same answer at lower precision */
    for (precision = 1; precision < 24; precision++) {
        ret = tsk_ls_hmm_set_precision(&ls_hmm, precision);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        /* TSK_NO_INIT: the viterbi matrix from the run above is reused. */
        ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, TSK_NO_INIT);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        tsk_viterbi_matrix_print_state(&viterbi, _devnull);
        tsk_ls_hmm_print_state(&ls_hmm, _devnull);
        ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        CU_ASSERT_EQUAL(path[0], 2);
        CU_ASSERT_EQUAL(path[1], 1);
        CU_ASSERT_EQUAL(path[2], 1);
    }

    tsk_ls_hmm_free(&ls_hmm);
    tsk_compressed_matrix_free(&forward);
    tsk_viterbi_matrix_free(&viterbi);
    tsk_treeseq_free(&ts);
}

/* Same set-up as the exact match test, but the middle entry of the query
 * haplotype is TSK_MISSING_DATA. Forward and Viterbi must succeed and the
 * expected traceback is { 2, 2, 2 }. */
static void
test_single_tree_missing_haplotype_data(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_ls_hmm_t ls_hmm;
    tsk_compressed_matrix_t forward;
    tsk_viterbi_matrix_t viterbi;

    double rho[] = { 0.0, 0.25, 0.25 };
    double mu[] = { 0, 0, 0 };
    int32_t h[] = { 1, TSK_MISSING_DATA, 1 };
    tsk_id_t path[3];
    double decoded_compressed_matrix[12];

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);

    ret = tsk_ls_hmm_init(&ls_hmm, &ts, rho, mu, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_ls_hmm_forward(&ls_hmm, h, &forward, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_compressed_matrix_print_state(&forward, _devnull);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);
    ret = tsk_compressed_matrix_decode(&forward, decoded_compressed_matrix);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_viterbi_matrix_print_state(&viterbi, _devnull);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);
    ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(path[0], 2);
    CU_ASSERT_EQUAL(path[1], 2);
    CU_ASSERT_EQUAL(path[2], 2);

    tsk_ls_hmm_free(&ls_hmm);
    tsk_compressed_matrix_free(&forward);
    tsk_viterbi_matrix_free(&viterbi);
    tsk_treeseq_free(&ts);
}

/* Forward, Viterbi and backward are all expected to return
 * TSK_ERR_MATCH_IMPOSSIBLE for the all-zero haplotype when every mutation
 * rate is zero. The print_state calls after each failure check that the
 * partially filled structures can still be printed. */
static void
test_single_tree_match_impossible(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_ls_hmm_t ls_hmm;
    tsk_compressed_matrix_t forward;
    tsk_compressed_matrix_t backward;
    tsk_viterbi_matrix_t viterbi;

    double rho[] = { 0.0, 0.25, 0.25 };
    double mu[] = { 0, 0, 0 };
    /* This haplotype can't happen with a mutation rate of 0 */
    int32_t h[] = { 0, 0, 0 };

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);

    ret = tsk_ls_hmm_init(&ls_hmm, &ts, rho, mu, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_ls_hmm_forward(&ls_hmm, h, &forward, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MATCH_IMPOSSIBLE);
    tsk_compressed_matrix_print_state(&forward, _devnull);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);

    ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MATCH_IMPOSSIBLE);
    tsk_viterbi_matrix_print_state(&viterbi, _devnull);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);

    /* The backward pass takes the normalisation factors stored in the
     * (failed) forward matrix as input. */
    ret = tsk_ls_hmm_backward(&ls_hmm, h, forward.normalisation_factor, &backward, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MATCH_IMPOSSIBLE);
    tsk_compressed_matrix_print_state(&backward, _devnull);
    /* tsk_compressed_matrix_print_state(&forward, stdout); */
    /* tsk_compressed_matrix_print_state(&backward, stdout); */
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);

    tsk_ls_hmm_free(&ls_hmm);
    tsk_compressed_matrix_free(&forward);
    tsk_compressed_matrix_free(&backward);
    tsk_viterbi_matrix_free(&viterbi);
    tsk_treeseq_free(&ts);
}

/* Checks error returns on the single tree example:
 *  - TSK_NO_INIT with an output matrix whose tree_sequence pointer is NULL
 *    gives TSK_ERR_BAD_PARAM_VALUE for both Viterbi and forward;
 *  - storing at site 3 or 4 gives TSK_ERR_SITE_OUT_OF_BOUNDS;
 *  - decoding a matrix holding a transition at tree node 7 gives
 *    TSK_ERR_NODE_OUT_OF_BOUNDS. */
static void
test_single_tree_errors(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_ls_hmm_t ls_hmm;
    tsk_compressed_matrix_t forward;
    tsk_viterbi_matrix_t viterbi;
    tsk_value_transition_t T[1];
    double decoded[3][4];

    double rho[] = { 0.0, 0.25, 0.25 };
    double mu[] = { 0, 0, 0 };
    int32_t h[] = { 0, 0, 0 };

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);

    ret = tsk_viterbi_matrix_init(&viterbi, &ts, 0, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_compressed_matrix_init(&forward, &ts, 0, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_ls_hmm_init(&ls_hmm, &ts, rho, mu, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Temporarily break the tree_sequence pointer, then restore it. */
    viterbi.matrix.tree_sequence = NULL;
    ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);
    viterbi.matrix.tree_sequence = &ts;

    forward.tree_sequence = NULL;
    ret = tsk_ls_hmm_forward(&ls_hmm, h, &forward, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);
    forward.tree_sequence = &ts;

    ret = tsk_compressed_matrix_store_site(&forward, 3, 0, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS);
    ret = tsk_compressed_matrix_store_site(&forward, 4, 0, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS);

    /* FIXME disabling this test for now because we filter out negative
     * nodes when storing now, to accommodate some oddness in the initial
     * conditions of the backward matrix.
     */
    /* T[0].tree_node = -1; */
    /* T[0].value = 0; */
    /* ret = tsk_compressed_matrix_store_site(&forward, 0, 1, 1, T); */
    /* CU_ASSERT_EQUAL_FATAL(ret, 0); */
    /* ret = tsk_compressed_matrix_decode(&forward, (double *) decoded); */
    /* CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); */

    /* Storing succeeds; the bad node is only reported when decoding. */
    T[0].tree_node = 7;
    T[0].value = 0;
    ret = tsk_compressed_matrix_store_site(&forward, 0, 1, 1, T);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_compressed_matrix_decode(&forward, (double *) decoded);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

    tsk_ls_hmm_free(&ls_hmm);
    tsk_compressed_matrix_free(&forward);
    tsk_viterbi_matrix_free(&viterbi);
    tsk_treeseq_free(&ts);
}

/* Exercises tsk_compressed_matrix_t directly: stores two transitions per
 * site, checks the decoded values, checks that a cleared matrix decodes to
 * zeros, and finally reuses the matrix as forward output via TSK_NO_INIT. */
static void
test_single_tree_compressed_matrix(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_compressed_matrix_t matrix;
    tsk_ls_hmm_t ls_hmm;
    tsk_size_t max_transitions = 1024;
    /* Sized by a runtime variable; only T[0] and T[1] are used below. */
    tsk_value_transition_t T[max_transitions];
    double decoded[3][4];
    int j;

    double rho[] = { 0.0, 0.25, 0.25 };
    double mu[] = { 0.1, 0.1, 0.1 };
    int32_t h[] = { 0, 0, 0 };

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);

    ret = tsk_compressed_matrix_init(&matrix, &ts, 0, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_compressed_matrix_print_state(&matrix, _devnull);

    /* Value 0 at node 6 and, for site j, value 1 at node j. */
    T[0].tree_node = 6;
    T[0].value = 0;
    for (j = 0; j < 3; j++) {
        T[1].tree_node = j;
        T[1].value = 1;
        ret = tsk_compressed_matrix_store_site(&matrix, j, 1.0, 2, T);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
    }

    tsk_compressed_matrix_print_state(&matrix, _devnull);

    /* The first three columns of the decoded matrix are expected to form
     * an identity pattern. */
    ret = tsk_compressed_matrix_decode(&matrix, (double *) decoded);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(decoded[0][0], 1.0);
    CU_ASSERT_EQUAL(decoded[0][1], 0.0);
    CU_ASSERT_EQUAL(decoded[0][2], 0.0);
    CU_ASSERT_EQUAL(decoded[1][0], 0.0);
    CU_ASSERT_EQUAL(decoded[1][1], 1.0);
    CU_ASSERT_EQUAL(decoded[1][2], 0.0);
    CU_ASSERT_EQUAL(decoded[2][0], 0.0);
    CU_ASSERT_EQUAL(decoded[2][1], 0.0);
    CU_ASSERT_EQUAL(decoded[2][2], 1.0);

    /* Cleared matrix should be zero everywhere */
    tsk_compressed_matrix_clear(&matrix);
    ret = tsk_compressed_matrix_decode(&matrix, (double *) decoded);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    for (j = 0; j < 3; j++) {
        CU_ASSERT_EQUAL(decoded[j][0], 0.0);
        CU_ASSERT_EQUAL(decoded[j][1], 0.0);
        CU_ASSERT_EQUAL(decoded[j][2], 0.0);
    }

    ret = tsk_ls_hmm_init(&ls_hmm, &ts, rho, mu, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_ls_hmm_forward(&ls_hmm, h, &matrix, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_compressed_matrix_print_state(&matrix, _devnull);
    ret = tsk_compressed_matrix_decode(&matrix, (double *) decoded);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    tsk_compressed_matrix_free(&matrix);
    tsk_ls_hmm_free(&ls_hmm);
    tsk_treeseq_free(&ts);
}

/* Exercises tsk_viterbi_matrix_t directly: traceback on an empty or cleared
 * matrix must return TSK_ERR_NULL_VITERBI_MATRIX, a hand-filled matrix must
 * trace back to { 1, 1, 1 }, and adding 100 recombination records to a
 * freshly initialised matrix must succeed. */
static void
test_single_tree_viterbi_matrix(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_viterbi_matrix_t viterbi;
    tsk_ls_hmm_t ls_hmm;

    double rho[] = { 0.0, 0.25, 0.25 };
    double mu[] = { 0, 0, 0 };
    int32_t h[] = { 1, 1, 1 };
    tsk_id_t path[3];
    tsk_value_transition_t T[2];
    int j;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);

    ret = tsk_viterbi_matrix_init(&viterbi, &ts, 0, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_viterbi_matrix_print_state(&viterbi, _devnull);

    /* Nothing has been stored yet. */
    ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NULL_VITERBI_MATRIX);

    T[0].tree_node = 6;
    T[0].value = 0;
    T[1].tree_node = 1;
    T[1].value = 1;
    for (j = 0; j < 3; j++) {
        ret = tsk_compressed_matrix_store_site(&viterbi.matrix, j, 1.0, 2, T);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        /* We need to have one record per site, so we put in a record
         * at the root saying we don't need to recombine */
        ret = tsk_viterbi_matrix_add_recombination_required(&viterbi, j, 6, false);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
    }
    ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(path[0], 1);
    CU_ASSERT_EQUAL_FATAL(path[1], 1);
    CU_ASSERT_EQUAL_FATAL(path[2], 1);

    ret = tsk_ls_hmm_init(&ls_hmm, &ts, rho, mu, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_viterbi_matrix_print_state(&viterbi, _devnull);
    ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* After clearing, traceback must fail again. */
    ret = tsk_viterbi_matrix_clear(&viterbi);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NULL_VITERBI_MATRIX);

    tsk_viterbi_matrix_free(&viterbi);

    ret = tsk_viterbi_matrix_init(&viterbi, &ts, 1, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* Make sure we hit the realloc case for recombination records */
    for (j = 0; j < 100; j++) {
        ret = tsk_viterbi_matrix_add_recombination_required(&viterbi, 0, 6, false);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
    }
    tsk_viterbi_matrix_print_state(&viterbi, _devnull);
    tsk_viterbi_matrix_free(&viterbi);

    tsk_ls_hmm_free(&ls_hmm);
    tsk_treeseq_free(&ts);
}

/* Forward, backward and Viterbi on the multi-tree "paper" example. The
 * expected Viterbi traceback is { 2, 0, 1 }, with the default settings and
 * for every precision in 4..23. */
static void
test_multi_tree_exact_match(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_ls_hmm_t ls_hmm;
    tsk_compressed_matrix_t forward, backward;
    tsk_viterbi_matrix_t viterbi;

    double rho[] = { 0.0, 0.25, 0.25 };
    double mu[] = { 0, 0, 0 };
    int32_t h[] = { 1, 1, 1 };
    tsk_id_t path[3];
    double decoded_compressed_matrix[12];
    unsigned int precision;

    tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
        paper_ex_mutations, paper_ex_individuals, NULL, 0);

    ret = tsk_ls_hmm_init(&ls_hmm, &ts, rho, mu, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_ls_hmm_forward(&ls_hmm, h, &forward, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);
    tsk_compressed_matrix_print_state(&forward, _devnull);
    ret = tsk_compressed_matrix_decode(&forward, decoded_compressed_matrix);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* The backward pass is driven by the forward normalisation factors. */
    ret = tsk_ls_hmm_backward(&ls_hmm, h, forward.normalisation_factor, &backward, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);
    tsk_compressed_matrix_print_state(&backward, _devnull);
    ret = tsk_compressed_matrix_decode(&backward, decoded_compressed_matrix);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_viterbi_matrix_print_state(&viterbi, _devnull);
    tsk_ls_hmm_print_state(&ls_hmm, _devnull);
    ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(path[0], 2);
    CU_ASSERT_EQUAL(path[1], 0);
    CU_ASSERT_EQUAL(path[2], 1);

    /* Should get the same answer at lower precision */
    for (precision = 4; precision < 24; precision++) {
        ret = tsk_ls_hmm_set_precision(&ls_hmm, precision);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        ret = tsk_ls_hmm_viterbi(&ls_hmm, h, &viterbi, TSK_NO_INIT);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        tsk_viterbi_matrix_print_state(&viterbi, _devnull);
        tsk_ls_hmm_print_state(&ls_hmm, _devnull);
        ret = tsk_viterbi_matrix_traceback(&viterbi, path, 0);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        CU_ASSERT_EQUAL(path[0], 2);
        CU_ASSERT_EQUAL(path[1], 0);
        CU_ASSERT_EQUAL(path[2], 1);
    }

    tsk_ls_hmm_free(&ls_hmm);
    tsk_compressed_matrix_free(&forward);
    tsk_compressed_matrix_free(&backward);
    tsk_viterbi_matrix_free(&viterbi);
    tsk_treeseq_free(&ts);
}

/* Decoding a matrix whose stored transition refers to a node that is not in
 * the tree at that site must return TSK_ERR_BAD_COMPRESSED_MATRIX_NODE. */
static void
test_multi_tree_errors(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_compressed_matrix_t forward;
    tsk_value_transition_t T[1];
    double decoded[3][4];

    tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
        paper_ex_mutations, paper_ex_individuals, NULL, 0);

    ret = tsk_compressed_matrix_init(&forward, &ts, 0, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* We want a tree node that is not in the first tree */
    T[0].tree_node = 7;
    T[0].value = 0;
    ret = tsk_compressed_matrix_store_site(&forward, 0, 1, 1, T);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_compressed_matrix_decode(&forward, (double *) decoded);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_COMPRESSED_MATRIX_NODE);

    tsk_compressed_matrix_free(&forward);
    tsk_treeseq_free(&ts);
}

/* Runs the forward algorithm on caterpillar trees built with n = 8, 16, 32
 * and 64, all of which must succeed. Then, on a tree built with n = 40 and
 * with ls_hmm.max_parsimony_words forced to 0, the forward run is expected
 * to return TSK_ERR_TOO_MANY_VALUES. */
static void
test_caterpillar_tree_many_values(void)
{
    int ret = 0;
    tsk_ls_hmm_t ls_hmm;
    tsk_compressed_matrix_t matrix;
    double rho[] = { 0.1, 0.1, 0.1, 0.1, 0.1 };
    double mu[] = { 0.0, 0.0, 0.0, 0.0, 0.0 };
    int32_t h[] = { 0, 0, 0, 0, 0 };
    tsk_size_t n[] = {
        8,
        16,
        32,
        64,
    };
    tsk_treeseq_t *ts;
    tsk_size_t j;

    for (j = 0; j < sizeof(n) / sizeof(*n); j++) {
        ts = caterpillar_tree(n[j], 5, n[j] - 2);
        ret = tsk_ls_hmm_init(&ls_hmm, ts, rho, mu, 0);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        ret = tsk_compressed_matrix_init(&matrix, ts, 1 << 10, 0);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        ret = tsk_ls_hmm_forward(&ls_hmm, h, &matrix, TSK_NO_INIT);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        tsk_compressed_matrix_print_state(&matrix, _devnull);
        tsk_ls_hmm_print_state(&ls_hmm, _devnull);

        tsk_ls_hmm_free(&ls_hmm);
        tsk_compressed_matrix_free(&matrix);
        /* The tree sequence object itself is released with free() here, in
         * addition to tsk_treeseq_free(). */
        tsk_treeseq_free(ts);
        free(ts);
    }

    j = 40;
    ts = caterpillar_tree(j, 5, j - 2);
    ret = tsk_ls_hmm_init(&ls_hmm, ts, rho, mu, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_compressed_matrix_init(&matrix, ts, 1 << 20, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* Short circuit this value so we can run the test */
    ls_hmm.max_parsimony_words = 0;
    ret = tsk_ls_hmm_forward(&ls_hmm, h, &matrix, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TOO_MANY_VALUES);

    tsk_ls_hmm_free(&ls_hmm);
    tsk_compressed_matrix_free(&matrix);
    tsk_treeseq_free(ts);
    free(ts);
}

/* Registers every test in this file and hands control to test_main. */
int
main(int argc, char **argv)
{
    CU_TestInfo tests[] = {
        { "test_single_tree_missing_alleles", test_single_tree_missing_alleles },
        { "test_single_tree_exact_match", test_single_tree_exact_match },
        { "test_single_tree_missing_haplotype_data",
            test_single_tree_missing_haplotype_data },
        { "test_single_tree_match_impossible", test_single_tree_match_impossible },
        { "test_single_tree_errors", test_single_tree_errors },
        { "test_single_tree_compressed_matrix", test_single_tree_compressed_matrix },
        { "test_single_tree_viterbi_matrix", test_single_tree_viterbi_matrix },
        { "test_multi_tree_exact_match", test_multi_tree_exact_match },
        { "test_multi_tree_errors", test_multi_tree_errors },
        { "test_caterpillar_tree_many_values", test_caterpillar_tree_many_values },
        { NULL, NULL },
    };

    return test_main(tests, argc, argv);
}

================================================
FILE: c/tests/test_minimal_cpp.cpp
================================================
/*
 * MIT License
 *
 * Copyright (c) 2019-2024 Tskit Developers
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* Minimal tests to make sure that tskit at least compiles and links
 * in a simple C++ program */

#include
#include
#include
#include
#include
#include
#include

using namespace std;

/* kas_strerror(KAS_ERR_NO_MEMORY) must stream as exactly "Out of memory". */
void
test_kas_strerror()
{
    std::cout << "test_kas_strerror" << endl;
    std::ostringstream o;
    o << kas_strerror(KAS_ERR_NO_MEMORY);
    assert(std::string("Out of memory").compare(o.str()) == 0);
}

/* tsk_strerror(TSK_ERR_NO_MEMORY) must stream as exactly
 * "Out of memory. (TSK_ERR_NO_MEMORY)". */
void
test_strerror()
{
    std::cout << "test_strerror" << endl;
    std::ostringstream o;
    o << tsk_strerror(TSK_ERR_NO_MEMORY);
    assert(std::string("Out of memory. (TSK_ERR_NO_MEMORY)").compare(o.str()) == 0);
}

/* Loading a path that does not exist must return TSK_ERR_IO.
 * NOTE(review): the banner printed below reads "test_open_error" although
 * the function is named test_load_error. */
void
test_load_error()
{
    std::cout << "test_open_error" << endl;
    tsk_treeseq_t ts;
    int ret = tsk_treeseq_load(&ts, "no such file", 0);
    assert(ret == TSK_ERR_IO);
    tsk_treeseq_free(&ts);
}

/* Adds two node rows to a fresh table collection and checks that the
 * returned row ids are 0 and 1 and that num_rows is 2. */
void
test_table_basics()
{
    std::cout << "test_table_basics" << endl;
    tsk_table_collection_t tables;
    int ret = tsk_table_collection_init(&tables, 0);
    assert(ret == 0);

    ret = tsk_node_table_add_row(&tables.nodes, 0, 1.0, TSK_NULL, TSK_NULL, NULL, 0);
    assert(ret == 0);
    ret = tsk_node_table_add_row(&tables.nodes, 0, 2.0, TSK_NULL, TSK_NULL, NULL, 0);
    assert(ret == 1);
    assert(tables.nodes.num_rows == 2);

    tsk_table_collection_free(&tables);
}

/* A definition of sort_edges that uses C++ std::sort and inlining of the
 * comparison function to achieve significantly better performance than
 * the builtin method in tskit.
 *
 * The edge columns are copied into a vector of small structs, sorted by
 * (parent time, parent, child, left) and written back in place. The
 * function asserts that start is 0 and that the edge table carries no
 * metadata, and always returns 0.
 */
int
cpp_sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start)
{
    struct _edge {
        double left, right;
        tsk_id_t parent, child;

        _edge(double l, double r, tsk_id_t p, tsk_id_t c)
            : left{ l }, right{ r }, parent{ p }, child{ c }
        {
        }
    };
    tsk_edge_table_t *edges = &sorter->tables->edges;
    const double *node_time = sorter->tables->nodes.time;
    std::vector<_edge> sorted_edges;
    size_t num_edges = edges->num_rows;
    size_t j;

    /* This is the comparison function.  We cannot define an
     * operator < for _edge because we need to bind the node times
     * so we have to use a functional method. This is a copy of the cmp
     * from fwdpp. Only difference is the final time comparison
     * (fwdpp table times go forwards). */
    const auto cmp = [&node_time](const _edge &lhs, const _edge &rhs) {
        auto tl = node_time[lhs.parent];
        auto tr = node_time[rhs.parent];
        if (tl == tr) {
            if (lhs.parent == rhs.parent) {
                if (lhs.child == rhs.child) {
                    return lhs.left < rhs.left;
                }
                return lhs.child < rhs.child;
            }
            return lhs.parent < rhs.parent;
        }
        return tl < tr;
    };

    assert(start == 0);
    /* Let's not bother with metadata */
    assert(edges->metadata_length == 0);

    sorted_edges.reserve(num_edges);
    for (j = 0; j < num_edges; j++) {
        sorted_edges.emplace_back(
            edges->left[j], edges->right[j], edges->parent[j], edges->child[j]);
    }

    std::sort(begin(sorted_edges), end(sorted_edges), cmp);

    /* Write the sorted rows back over the original columns. */
    for (j = 0; j < num_edges; j++) {
        edges->left[j] = sorted_edges[j].left;
        edges->right[j] = sorted_edges[j].right;
        edges->parent[j] = sorted_edges[j].parent;
        edges->child[j] = sorted_edges[j].child;
    }
    return 0;
}

/* Builds a 10 node stick tree whose edges are added in an order that fails
 * the TSK_CHECK_EDGE_ORDERING integrity check, sorts the tables with
 * cpp_sort_edges installed as the sorter's sort_edges callback, and checks
 * that the integrity check then passes. */
void
test_edge_sorting()
{
    std::cout << "test_edge_sorting" << endl;
    tsk_table_collection_t tables;
    tsk_id_t n = 10;
    tsk_id_t j;
    int ret = tsk_table_collection_init(&tables, 0);
    assert(ret == 0);

    tables.sequence_length = 1.0;
    /* Make a stick tree */
    /* Add nodes and edges */
    for (j = 0; j < n; j++) {
        ret = tsk_node_table_add_row(
            &tables.nodes, TSK_NODE_IS_SAMPLE, j + 1, TSK_NULL, TSK_NULL, NULL, 0);
        assert(ret == j);
    }
    for (j = n - 1; j > 0; j--) {
        tsk_edge_table_add_row(&tables.edges, 0, 1, j, j - 1, NULL, 0);
    }
    assert(tables.nodes.num_rows == (tsk_size_t) n);
    assert(tables.edges.num_rows == (tsk_size_t) n - 1);

    /* Make sure the edges are unsorted */
    /* Not calling TSK_CHECK_TREES so casting is safe */
    ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_EDGE_ORDERING);
    assert(ret == TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME);

    /* Sort the tables */
    tsk_table_sorter_t sorter;
    ret = tsk_table_sorter_init(&sorter, &tables, 0);
    assert(ret == 0);
    /* Set the sort_edges to our local C++ version. We could also set some
     * persistent state in sorter.params if we wanted to. */
    sorter.sort_edges = cpp_sort_edges;
    ret = tsk_table_sorter_run(&sorter, NULL);
    assert(ret == 0);
    tsk_table_sorter_free(&sorter);

    /* Make sure the edges are now sorted */
    ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_EDGE_ORDERING);
    assert(ret == 0);

    tsk_table_collection_free(&tables);
}

/* Sorter callback that always throws a std::exception; the return
 * statement is never reached. */
int
sort_edges_raises_exception(tsk_table_sorter_t *sorter, tsk_size_t start)
{
    throw std::exception();
    return 0;
}

/* Sorter callback that always throws an int rather than an exception
 * object; the return statement is never reached. */
int
sort_edges_raises_non_exception(tsk_table_sorter_t *sorter, tsk_size_t start)
{
    throw 42;
    return 0;
}

/* Wraps the two throwing callbacks so that nothing propagates out of this
 * function as a C++ exception: returns -12345 when sorter->user_data is NULL
 * (std::exception case) and -12346 otherwise (non-exception case). */
int
safe_sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start)
{
    int ret = 0;
    if (sorter->user_data == NULL) {
        try {
            ret = sort_edges_raises_exception(sorter, start);
        } catch (...) {
            ret = -12345;
        }
    } else {
        try {
            ret = sort_edges_raises_non_exception(sorter, start);
        } catch (...) {
            ret = -12346;
        }
    }
    return ret;
}

/* Checks that the error codes produced by safe_sort_edges are returned
 * unchanged by tsk_table_sorter_run. Compiled out when _WIN32 is defined. */
void
test_edge_sorting_errors()
{
    /* Some inexplicable error happened here on 32 bit Windows where the
     * exceptions were not being caught as expected. This seems much
     * more likely to be a platform quirk than a real bug in our code,
     * so just disabling the test there.
     *
     * https://github.com/tskit-dev/tskit/issues/1790
     * https://github.com/tskit-dev/tskit/pull/1791
     */
#if !defined(_WIN32)
    std::cout << "test_edge_sorting_errors" << endl;
    tsk_table_collection_t tables;
    tsk_table_sorter_t sorter;
    tsk_id_t ret = tsk_table_collection_init(&tables, 0);

    assert(ret == 0);
    tables.sequence_length = 1.0;

    ret = tsk_table_sorter_init(&sorter, &tables, 0);
    assert(ret == 0);
    sorter.sort_edges = safe_sort_edges;
    ret = tsk_table_sorter_run(&sorter, NULL);
    assert(ret == -12345);

    /* Use the user_data as a way to communicate with the sorter
     * function. Here, we want to try out two different types
     * of exception that get thrown. */
    sorter.user_data = &tables;
    ret = tsk_table_sorter_run(&sorter, NULL);
    assert(ret == -12346);

    tsk_table_sorter_free(&sorter);
    tsk_table_collection_free(&tables);
#endif
}

/* Runs every test in order; the tests report failures through assert(). */
int
main()
{
    test_kas_strerror();
    test_strerror();
    test_load_error();
    test_table_basics();
    test_edge_sorting();
    test_edge_sorting_errors();
    return 0;
}

================================================
FILE: c/tests/test_stats.c
================================================
/*
 * MIT License
 *
 * Copyright (c) 2019-2024 Tskit Developers
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
*/

#include "testlib.h"
#include
#include
#include
#include
#include

/* Returns true if any site with id in [start, min(num_sites, end)) has more
 * than one mutation. Every site lookup in that range is asserted to
 * succeed. */
static bool
multi_mutations_exist(tsk_treeseq_t *ts, tsk_id_t start, tsk_id_t end)
{
    int ret;
    tsk_id_t j;
    tsk_site_t site;

    for (j = start; j < TSK_MIN((tsk_id_t) tsk_treeseq_get_num_sites(ts), end); j++) {
        ret = tsk_treeseq_get_site(ts, j, &site);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        if (site.mutations_length > 1) {
            return true;
        }
    }
    return false;
}

/* Consistency checks for tsk_ld_calc_t on an arbitrary tree sequence:
 *  - r2 of a site with itself is 1 (within eps) when the site has at most
 *    one mutation, and TSK_ERR_ONLY_INFINITE_SITES otherwise;
 *  - forward and reverse r2 arrays have the expected length, are identical
 *    when recomputed, and agree with the pairwise tsk_ld_calc_get_r2 values;
 *  - a direction of 0 gives TSK_ERR_BAD_PARAM_VALUE and out-of-range site ids
 *    give TSK_ERR_SITE_OUT_OF_BOUNDS.
 *
 * NOTE(review): for a tree sequence with exactly one site the expression
 * (tsk_id_t) num_sites - 2 evaluates to -1, which is then used as a site id
 * both for tsk_ld_calc_get_r2_array and for multi_mutations_exist — confirm
 * that callers never pass such a tree sequence.
 */
static void
verify_ld(tsk_treeseq_t *ts)
{
    int ret;
    tsk_size_t num_sites = tsk_treeseq_get_num_sites(ts);
    tsk_site_t *sites = tsk_malloc(num_sites * sizeof(tsk_site_t));
    int *num_site_mutations = tsk_malloc(num_sites * sizeof(int));
    tsk_ld_calc_t ld_calc;
    double *r2, *r2_prime, x;
    tsk_id_t j;
    tsk_size_t num_r2_values;
    double eps = 1e-6;

    r2 = tsk_calloc(num_sites, sizeof(double));
    r2_prime = tsk_calloc(num_sites, sizeof(double));
    /* All four buffers are checked before first use. */
    CU_ASSERT_FATAL(r2 != NULL);
    CU_ASSERT_FATAL(r2_prime != NULL);
    CU_ASSERT_FATAL(sites != NULL);
    CU_ASSERT_FATAL(num_site_mutations != NULL);

    ret = tsk_ld_calc_init(&ld_calc, ts);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tsk_ld_calc_print_state(&ld_calc, _devnull);

    /* Self-comparison of every site. */
    for (j = 0; j < (tsk_id_t) num_sites; j++) {
        ret = tsk_treeseq_get_site(ts, j, sites + j);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        num_site_mutations[j] = (int) sites[j].mutations_length;
        ret = tsk_ld_calc_get_r2(&ld_calc, j, j, &x);
        if (num_site_mutations[j] <= 1) {
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_DOUBLE_EQUAL_FATAL(x, 1.0, eps);
        } else {
            CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ONLY_INFINITE_SITES);
        }
    }

    if (num_sites > 0) {
        /* Some checks in the forward direction */
        ret = tsk_ld_calc_get_r2_array(
            &ld_calc, 0, TSK_DIR_FORWARD, num_sites, DBL_MAX, r2, &num_r2_values);
        if (multi_mutations_exist(ts, 0, (tsk_id_t) num_sites)) {
            CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ONLY_INFINITE_SITES);
        } else {
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_EQUAL_FATAL(num_r2_values, num_sites - 1);
        }
        tsk_ld_calc_print_state(&ld_calc, _devnull);

        /* Starting from the second-to-last site, exactly one value is
         * expected. */
        ret = tsk_ld_calc_get_r2_array(&ld_calc, (tsk_id_t) num_sites - 2,
            TSK_DIR_FORWARD, num_sites, DBL_MAX, r2_prime, &num_r2_values);
        if (multi_mutations_exist(ts, (tsk_id_t) num_sites - 2, (tsk_id_t) num_sites)) {
            CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ONLY_INFINITE_SITES);
        } else {
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_EQUAL_FATAL(num_r2_values, 1);
        }
        tsk_ld_calc_print_state(&ld_calc, _devnull);

        /* Recompute the full forward array into r2_prime and compare it
         * with r2 and with the pairwise values. */
        ret = tsk_ld_calc_get_r2_array(
            &ld_calc, 0, TSK_DIR_FORWARD, num_sites, DBL_MAX, r2_prime, &num_r2_values);
        if (multi_mutations_exist(ts, 0, (tsk_id_t) num_sites)) {
            CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ONLY_INFINITE_SITES);
        } else {
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_EQUAL_FATAL(num_r2_values, num_sites - 1);
            for (j = 0; j < (tsk_id_t) num_r2_values; j++) {
                CU_ASSERT_EQUAL_FATAL(r2[j], r2_prime[j]);
                ret = tsk_ld_calc_get_r2(&ld_calc, 0, j + 1, &x);
                CU_ASSERT_EQUAL_FATAL(ret, 0);
                CU_ASSERT_DOUBLE_EQUAL_FATAL(r2[j], x, eps);
            }
        }

        /* Some checks in the reverse direction */
        ret = tsk_ld_calc_get_r2_array(&ld_calc, (tsk_id_t) num_sites - 1,
            TSK_DIR_REVERSE, num_sites, DBL_MAX, r2, &num_r2_values);
        if (multi_mutations_exist(ts, 0, (tsk_id_t) num_sites)) {
            CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ONLY_INFINITE_SITES);
        } else {
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_EQUAL_FATAL(num_r2_values, num_sites - 1);
        }
        tsk_ld_calc_print_state(&ld_calc, _devnull);

        /* Going backwards from site 1 only site 0 remains. */
        ret = tsk_ld_calc_get_r2_array(
            &ld_calc, 1, TSK_DIR_REVERSE, num_sites, DBL_MAX, r2_prime, &num_r2_values);
        if (multi_mutations_exist(ts, 0, 2)) {
            CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ONLY_INFINITE_SITES);
        } else {
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_EQUAL_FATAL(num_r2_values, 1);
        }

        /* Recompute the full reverse array and compare as above; entry j
         * corresponds to site num_sites - j - 2. */
        ret = tsk_ld_calc_get_r2_array(&ld_calc, (tsk_id_t) num_sites - 1,
            TSK_DIR_REVERSE, num_sites, DBL_MAX, r2_prime, &num_r2_values);
        if (multi_mutations_exist(ts, 0, (tsk_id_t) num_sites)) {
            CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ONLY_INFINITE_SITES);
        } else {
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_EQUAL_FATAL(num_r2_values, num_sites - 1);
            tsk_ld_calc_print_state(&ld_calc, _devnull);

            for (j = 0; j < (tsk_id_t) num_r2_values; j++) {
                CU_ASSERT_EQUAL_FATAL(r2[j], r2_prime[j]);
                ret = tsk_ld_calc_get_r2(&ld_calc, (tsk_id_t) num_sites - 1,
                    (tsk_id_t) num_sites - j - 2, &x);
                CU_ASSERT_EQUAL_FATAL(ret, 0);
                CU_ASSERT_DOUBLE_EQUAL_FATAL(r2[j], x, eps);
            }
        }

        /* Check some error conditions */
        /* A direction argument of 0 is rejected. */
        ret = tsk_ld_calc_get_r2_array(
            &ld_calc, 0, 0, num_sites, DBL_MAX, r2, &num_r2_values);
        CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE);
    }

    /* Check some error conditions */
    /* Site ids num_sites and num_sites + 1 are out of range. */
    for (j = (tsk_id_t) num_sites; j < (tsk_id_t) num_sites + 2; j++) {
        ret = tsk_ld_calc_get_r2_array(
            &ld_calc, j, TSK_DIR_FORWARD, num_sites, DBL_MAX, r2, &num_r2_values);
        CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS);
        ret = tsk_ld_calc_get_r2(&ld_calc, j, 0, r2);
        CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS);
        ret = tsk_ld_calc_get_r2(&ld_calc, 0, j, r2);
        CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS);
    }

    tsk_ld_calc_free(&ld_calc);
    free(r2);
    free(r2_prime);
    free(sites);
    free(num_site_mutations);
}

/* FIXME: this test is weak and should check the return value somehow.
 * We should also have simplest and single tree tests along with separate
 * tests for the error conditions. This should be done as part of the general
 * stats framework.
*/
static void
verify_genealogical_nearest_neighbours(tsk_treeseq_t *ts)
{
    /* Smoke test only: both calls below are required to return 0, the
     * values written to the output buffer are not inspected. */
    const tsk_size_t n = tsk_treeseq_get_num_samples(ts);
    const tsk_size_t half = n / 2;
    /* One output value per (focal sample, reference set) pair. */
    double *gnn = tsk_malloc(2 * n * sizeof(double));
    const tsk_id_t *focal;
    const tsk_id_t *ref_sets[2];
    tsk_size_t ref_set_sizes[2];
    int err;

    CU_ASSERT_FATAL(gnn != NULL);
    focal = tsk_treeseq_get_samples(ts);

    /* First pass: split the sample list into two contiguous halves; the
     * second half takes the extra sample when n is odd. */
    ref_sets[0] = focal;
    ref_sets[1] = focal + half;
    ref_set_sizes[0] = half;
    ref_set_sizes[1] = n - half;
    err = tsk_treeseq_genealogical_nearest_neighbours(
        ts, focal, n, ref_sets, ref_set_sizes, 2, 0, gnn);
    CU_ASSERT_EQUAL_FATAL(err, 0);

    /* Second pass: two singleton reference sets holding the first and the
     * second sample respectively. */
    ref_sets[0] = focal;
    ref_sets[1] = focal + 1;
    ref_set_sizes[0] = 1;
    ref_set_sizes[1] = 1;
    err = tsk_treeseq_genealogical_nearest_neighbours(
        ts, focal, n, ref_sets, ref_set_sizes, 2, 0, gnn);
    CU_ASSERT_EQUAL_FATAL(err, 0);

    free(gnn);
}

/* FIXME: this test is weak and should check the return value somehow.
 * We should also have simplest and single tree tests along with separate
 * tests for the error conditions. This should be done as part of the general
 * stats framework.
*/
static void
verify_mean_descendants(tsk_treeseq_t *ts)
{
    int ret;
    tsk_id_t *samples;
    const tsk_id_t *sample_sets[2];
    tsk_size_t sample_set_size[2];
    tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts);
    /* Two output values (one per sample set) for every node. */
    double *C = tsk_malloc(2 * tsk_treeseq_get_num_nodes(ts) * sizeof(double));
    CU_ASSERT_FATAL(C != NULL);

    /* Work on a private, writable copy of the sample list so that the
     * error cases below can corrupt an entry without touching the tree
     * sequence's own array. */
    samples = tsk_malloc(num_samples * sizeof(*samples));
    /* The copy is written to immediately, so a failed allocation must stop
     * the test here rather than crash inside tsk_memcpy. */
    CU_ASSERT_FATAL(samples != NULL);
    tsk_memcpy(samples, tsk_treeseq_get_samples(ts), num_samples * sizeof(*samples));

    /* Two contiguous sample sets: first half and the remainder. */
    sample_sets[0] = samples;
    sample_set_size[0] = num_samples / 2;
    sample_sets[1] = samples + sample_set_size[0];
    sample_set_size[1] = num_samples - sample_set_size[0];

    ret = tsk_treeseq_mean_descendants(ts, sample_sets, sample_set_size, 2, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Check some error conditions */
    /* Zero sample sets is rejected. */
    ret = tsk_treeseq_mean_descendants(ts, sample_sets, sample_set_size, 0, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);

    /* Negative node ID. */
    samples[0] = -1;
    ret = tsk_treeseq_mean_descendants(ts, sample_sets, sample_set_size, 2, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

    /* Node ID past the end of the node table. */
    samples[0] = (tsk_id_t) tsk_treeseq_get_num_nodes(ts) + 1;
    ret = tsk_treeseq_mean_descendants(ts, sample_sets, sample_set_size, 2, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

    free(samples);
    free(C);
}

/* Check the divergence matrix by running against the stats API equivalent
 * code.
*/ static void verify_divergence_matrix(tsk_treeseq_t *ts, tsk_flags_t options) { int ret; const tsk_size_t n = tsk_treeseq_get_num_samples(ts); const tsk_id_t *samples = tsk_treeseq_get_samples(ts); tsk_size_t sample_set_sizes[n]; tsk_id_t index_tuples[2 * n * n]; double D1[n * n], D2[n * n]; tsk_size_t i, j, k; for (j = 0; j < n; j++) { sample_set_sizes[j] = 1; for (k = 0; k < n; k++) { index_tuples[2 * (j * n + k)] = (tsk_id_t) j; index_tuples[2 * (j * n + k) + 1] = (tsk_id_t) k; } } ret = tsk_treeseq_divergence( ts, n, sample_set_sizes, samples, n * n, index_tuples, 0, NULL, options, D1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_divergence_matrix( ts, n, sample_set_sizes, samples, 0, NULL, options, D2); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < n; j++) { for (k = 0; k < n; k++) { i = j * n + k; /* printf("%d\t%d\t%f\t%f\n", (int) j, (int) k, D1[i], D2[i]); */ if (j == k) { CU_ASSERT_EQUAL(D2[i], 0); } else { CU_ASSERT_DOUBLE_EQUAL(D1[i], D2[i], 1E-6); } } } } /* Check coalescence counts */ static void verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options) { int ret; const tsk_size_t n = tsk_treeseq_get_num_samples(ts); const tsk_size_t N = tsk_treeseq_get_num_nodes(ts); const tsk_size_t T = tsk_treeseq_get_num_trees(ts); const tsk_id_t *samples = tsk_treeseq_get_samples(ts); const double *breakpoints = tsk_treeseq_get_breakpoints(ts); const tsk_size_t P = 2; const tsk_size_t I = P * (P + 1) / 2; const tsk_size_t B = 8; tsk_id_t sample_sets[n]; tsk_size_t sample_set_sizes[P]; tsk_id_t index_tuples[2 * I]; tsk_id_t node_bin_map[N]; tsk_size_t dim = T * N * I; double C[dim]; double C_B[T * B * I]; double C_Nh[T * (N / 2) * I]; tsk_size_t i, j, k; for (i = 0; i < n; i++) { sample_sets[i] = samples[i]; } for (i = 0; i < P; i++) { sample_set_sizes[i] = 0; } for (j = 0; j < n; j++) { i = j / ((n + P - 1) / P); sample_set_sizes[i]++; } for (j = 0, i = 0; j < P; j++) { for (k = j; k < P; k++) { index_tuples[i++] = (tsk_id_t) j; 
index_tuples[i++] = (tsk_id_t) k; } } /* test various bin assignments */ for (i = 0; i < N; i++) { node_bin_map[i] = ((tsk_id_t) (i % B)); } ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map, options, C_B); CU_ASSERT_EQUAL_FATAL(ret, 0); for (i = 0; i < N; i++) { node_bin_map[i] = i < N / 2 ? ((tsk_id_t) i) : TSK_NULL; } ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, N / 2, node_bin_map, options, C_Nh); CU_ASSERT_EQUAL_FATAL(ret, 0); for (i = 0; i < N; i++) { node_bin_map[i] = (tsk_id_t) i; } ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, N, node_bin_map, options, C); CU_ASSERT_EQUAL_FATAL(ret, 0); /* cover errors */ double bad_breakpoints[2] = { breakpoints[1], 0.0 }; ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, 1, bad_breakpoints, N, node_bin_map, options, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); index_tuples[0] = (tsk_id_t) P; ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, N, node_bin_map, options, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX); index_tuples[0] = 0; tsk_size_t tmp = sample_set_sizes[0]; sample_set_sizes[0] = 0; ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, N, node_bin_map, options, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EMPTY_SAMPLE_SET); sample_set_sizes[0] = tmp; sample_sets[1] = 0; ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, N, node_bin_map, options, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); sample_sets[1] = 1; ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, N - 1, node_bin_map, 
options, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_BIN_MAP_DIM); ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, 0, node_bin_map, options, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_BIN_MAP_DIM); node_bin_map[0] = -2; ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, N, node_bin_map, options, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_BIN_MAP); node_bin_map[0] = 0; } /* Check coalescence quantiles */ static void verify_pair_coalescence_quantiles(tsk_treeseq_t *ts) { int ret; const tsk_size_t n = tsk_treeseq_get_num_samples(ts); const tsk_size_t N = tsk_treeseq_get_num_nodes(ts); const tsk_size_t T = tsk_treeseq_get_num_trees(ts); const tsk_id_t *samples = tsk_treeseq_get_samples(ts); const double *breakpoints = tsk_treeseq_get_breakpoints(ts); const double *nodes_time = ts->tables->nodes.time; const double max_time = ts->max_time; const tsk_size_t P = 2; const tsk_size_t Q = 5; const tsk_size_t B = 4; const tsk_size_t I = P * (P + 1) / 2; double quantiles[] = { 0.0, 0.25, 0.5, 0.75, 1.0 }; double epochs[] = { 0.0, max_time / 4, max_time / 2, max_time, INFINITY }; tsk_id_t sample_sets[n]; tsk_size_t sample_set_sizes[P]; tsk_id_t index_tuples[2 * I]; tsk_id_t node_bin_map[N]; tsk_id_t node_bin_map_empty[N]; tsk_id_t node_bin_map_shuff[N]; tsk_size_t dim = T * Q * I; double C[dim]; tsk_size_t i, j, k; for (i = 0; i < N; i++) { node_bin_map_empty[i] = TSK_NULL; node_bin_map_shuff[i] = (tsk_id_t) (i % B); for (j = 0; j < B; j++) { if (nodes_time[i] >= epochs[j] && nodes_time[i] < epochs[j + 1]) { node_bin_map[i] = (tsk_id_t) j; } } } for (i = 0; i < n; i++) { sample_sets[i] = samples[i]; } for (i = 0; i < P; i++) { sample_set_sizes[i] = 0; } for (j = 0; j < n; j++) { i = j / (n / P); sample_set_sizes[i]++; } for (j = 0, i = 0; j < P; j++) { for (k = j; k < P; k++) { index_tuples[i++] = (tsk_id_t) j; index_tuples[i++] = 
(tsk_id_t) k; } } ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C); CU_ASSERT_EQUAL_FATAL(ret, 0); quantiles[Q - 1] = 0.9; ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C); CU_ASSERT_EQUAL_FATAL(ret, 0); quantiles[Q - 1] = 1.0; ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map_empty, Q, quantiles, 0, C); CU_ASSERT_EQUAL_FATAL(ret, 0); /* cover errors */ quantiles[0] = -1.0; ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_QUANTILES); quantiles[0] = 0.0; quantiles[Q - 1] = 2.0; ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_QUANTILES); quantiles[Q - 1] = 1.0; quantiles[1] = 0.0; quantiles[0] = 0.25; ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_QUANTILES); quantiles[0] = 0.0; quantiles[1] = 0.25; ts->tables->nodes.time[N - 1] = -1.0; ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map_shuff, Q, quantiles, 0, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_TIMES); ts->tables->nodes.time[N - 1] = max_time; node_bin_map[0] = (tsk_id_t) B; ret = tsk_treeseq_pair_coalescence_quantiles(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map, Q, quantiles, 0, C); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_BIN_MAP_DIM); node_bin_map[0] = 0; } /* Check 
coalescence rates */
static void
verify_pair_coalescence_rates(tsk_treeseq_t *ts)
{
    int ret;
    const tsk_size_t n = tsk_treeseq_get_num_samples(ts);
    const tsk_size_t N = tsk_treeseq_get_num_nodes(ts);
    const tsk_size_t T = tsk_treeseq_get_num_trees(ts);
    const tsk_id_t *samples = tsk_treeseq_get_samples(ts);
    const double *breakpoints = tsk_treeseq_get_breakpoints(ts);
    const double *nodes_time = ts->tables->nodes.time;
    const double max_time = ts->max_time;
    const tsk_size_t P = 2;               /* number of sample sets */
    const tsk_size_t B = 5;               /* number of time windows */
    const tsk_size_t I = P * (P + 1) / 2; /* number of unordered set pairs */
    /* B + 1 window edges; the final one is infinite. */
    double epochs[]
        = { 0.0, max_time / 4, max_time / 2, max_time, max_time * 2, INFINITY };
    tsk_id_t sample_sets[n];
    tsk_size_t sample_set_sizes[P];
    tsk_id_t index_tuples[2 * I];
    tsk_id_t node_bin_map[N];
    tsk_id_t empty_node_bin_map[N];
    tsk_size_t dim = T * B * I;
    double C[dim];
    tsk_size_t i, j, k;

    /* Map each node to the time window containing its time (or leave it
     * unmapped), and build an all-unmapped map alongside. */
    for (i = 0; i < N; i++) {
        node_bin_map[i] = TSK_NULL;
        for (j = 0; j < B; j++) {
            if (nodes_time[i] >= epochs[j] && nodes_time[i] < epochs[j + 1]) {
                node_bin_map[i] = (tsk_id_t) j;
            }
        }
        empty_node_bin_map[i] = TSK_NULL;
    }
    /* Remember the computed bins of the two entries that the checks below
     * overwrite, so they can be put back exactly. */
    const tsk_id_t first_bin = node_bin_map[0];
    const tsk_id_t last_bin = node_bin_map[N - 1];

    for (i = 0; i < n; i++) {
        sample_sets[i] = samples[i];
    }
    for (i = 0; i < P; i++) {
        sample_set_sizes[i] = 0;
    }
    /* Split the samples into P contiguous sets. Ceiling division keeps the
     * set index below P for every n >= 1; floor division (n / P) would
     * index past sample_set_sizes when n is not a multiple of P and divide
     * by zero when n < P. */
    for (j = 0; j < n; j++) {
        i = j / ((n + P - 1) / P);
        sample_set_sizes[i]++;
    }
    /* All (j, k) pairs with j <= k. */
    for (j = 0, i = 0; j < P; j++) {
        for (k = j; k < P; k++) {
            index_tuples[i++] = (tsk_id_t) j;
            index_tuples[i++] = (tsk_id_t) k;
        }
    }

    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* A single unmapped node is accepted. */
    node_bin_map[0] = TSK_NULL;
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    node_bin_map[0] = first_bin;

    /* A map with every node unmapped is accepted. */
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, empty_node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* cover errors */
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, 0, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS_DIM);

    /* First window edge moved above zero. */
    epochs[0] = epochs[1] / 2;
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_PAIR_TIMES);
    epochs[0] = 0.0;

    /* Two equal consecutive window edges. */
    epochs[2] = epochs[1];
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS);
    epochs[2] = max_time / 2;

    /* Finite final window edge. */
    epochs[B] = DBL_MAX;
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS_END);
    epochs[B] = INFINITY;

    /* Bin index equal to the number of windows is out of range. */
    node_bin_map[0] = (tsk_id_t) B;
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_BIN_MAP_DIM);

    /* First node assigned to the last window. */
    node_bin_map[0] = (tsk_id_t) (B - 1);
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_TIME_WINDOW);
    node_bin_map[0] = first_bin;

    /* Last node assigned to the first window. */
    node_bin_map[N - 1] = 0;
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_TIME_WINDOW);
    node_bin_map[N - 1] = last_bin;

    tsk_size_t tmp = sample_set_sizes[0];
    sample_set_sizes[0] = 0;
    ret = tsk_treeseq_pair_coalescence_rates(ts, P, sample_set_sizes, sample_sets, I,
        index_tuples, T, breakpoints, B, node_bin_map, epochs, 0, C);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EMPTY_SAMPLE_SET);
    sample_set_sizes[0] = tmp;
}
typedef struct { int call_count; int error_on; int error_code; } general_stat_error_params_t; static int general_stat_error(tsk_size_t TSK_UNUSED(K), const double *TSK_UNUSED(X), tsk_size_t M, double *Y, void *params) { int ret = 0; CU_ASSERT_FATAL(M == 1); Y[0] = 0; general_stat_error_params_t *the_params = (general_stat_error_params_t *) params; if (the_params->call_count == the_params->error_on) { ret = the_params->error_code; } the_params->call_count++; return ret; } static void verify_window_errors(tsk_treeseq_t *ts, tsk_flags_t mode) { int ret; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); double *W = tsk_calloc(num_samples, sizeof(double)); /* node mode requires this much space at least */ double *sigma = tsk_calloc(tsk_treeseq_get_num_nodes(ts), sizeof(double)); double windows[] = { 0, 0, 0 }; tsk_flags_t options = mode; /* Window errors */ ret = tsk_treeseq_general_stat( ts, 1, W, 1, general_stat_error, NULL, 0, windows, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS); ret = tsk_treeseq_general_stat( ts, 1, W, 1, general_stat_error, NULL, 2, windows, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = -1; ret = tsk_treeseq_general_stat( ts, 1, W, 1, general_stat_error, NULL, 2, windows, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[1] = -1; ret = tsk_treeseq_general_stat( ts, 1, W, 1, general_stat_error, NULL, 1, windows, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = 10; ret = tsk_treeseq_general_stat( ts, 1, W, 1, general_stat_error, NULL, 2, windows, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = 0; windows[2] = tsk_treeseq_get_sequence_length(ts) + 1; ret = tsk_treeseq_general_stat( ts, 1, W, 1, general_stat_error, NULL, 2, windows, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = 0; windows[1] = -1; windows[2] = tsk_treeseq_get_sequence_length(ts); ret = 
tsk_treeseq_general_stat( ts, 1, W, 1, general_stat_error, NULL, 2, windows, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); free(W); free(sigma); } static void verify_summary_func_errors(tsk_treeseq_t *ts, tsk_flags_t mode) { int ret; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); double *W = tsk_calloc(num_samples, sizeof(double)); /* We need this much space for NODE mode */ double *sigma = tsk_calloc(tsk_treeseq_get_num_nodes(ts), sizeof(double)); int j; general_stat_error_params_t params; CU_ASSERT_FATAL(W != NULL); /* Errors in the summary function */ j = 1; while (true) { params.call_count = 0; params.error_on = j; params.error_code = -j; ret = tsk_treeseq_general_stat(ts, 1, W, 1, general_stat_error, ¶ms, 0, NULL, TSK_STAT_POLARISED | mode, sigma); if (ret == 0) { break; } CU_ASSERT_EQUAL_FATAL(ret, params.error_code); j++; } CU_ASSERT_FATAL(j > 1); j = 1; while (true) { params.call_count = 0; params.error_on = j; params.error_code = -j; ret = tsk_treeseq_general_stat( ts, 1, W, 1, general_stat_error, ¶ms, 0, NULL, mode, sigma); if (ret == 0) { break; } CU_ASSERT_EQUAL_FATAL(ret, params.error_code); j++; } CU_ASSERT_FATAL(j > 1); free(W); free(sigma); } static void verify_branch_general_stat_errors(tsk_treeseq_t *ts) { verify_summary_func_errors(ts, TSK_STAT_BRANCH); verify_window_errors(ts, TSK_STAT_BRANCH); } static void verify_site_general_stat_errors(tsk_treeseq_t *ts) { verify_window_errors(ts, TSK_STAT_SITE); verify_summary_func_errors(ts, TSK_STAT_SITE); } static void verify_node_general_stat_errors(tsk_treeseq_t *ts) { verify_window_errors(ts, TSK_STAT_NODE); verify_summary_func_errors(ts, TSK_STAT_NODE); } static void verify_one_way_weighted_func_errors(tsk_treeseq_t *ts, one_way_weighted_method *method) { int ret; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); double *weights = tsk_malloc(num_samples * sizeof(double)); double bad_windows[] = { 0, -1 }; double result; tsk_size_t j; for (j = 0; j < num_samples; 
j++) { weights[j] = 1.0; } ret = method(ts, 0, weights, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_WEIGHTS); ret = method(ts, 1, weights, 1, bad_windows, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); free(weights); } static void verify_one_way_weighted_covariate_func_errors( tsk_treeseq_t *ts, one_way_covariates_method *method) { int ret; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); double *weights = tsk_malloc(num_samples * sizeof(double)); double *covariates = NULL; double bad_windows[] = { 0, -1 }; double result; tsk_size_t j; for (j = 0; j < num_samples; j++) { weights[j] = 1.0; } ret = method(ts, 0, weights, 0, covariates, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_WEIGHTS); ret = method(ts, 1, weights, 0, covariates, 1, bad_windows, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); free(weights); } static void verify_one_way_stat_func_errors(tsk_treeseq_t *ts, one_way_sample_stat_method *method) { int ret; tsk_id_t num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(ts); tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes = 4; double windows[] = { 0, 0, 0 }; double result; ret = method(ts, 0, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); samples[0] = TSK_NULL; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = -10; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = num_nodes; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = num_nodes + 1; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = num_nodes - 1; ret = method(ts, 1, &sample_set_sizes, 
samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLES); samples[0] = 1; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); samples[0] = 0; sample_set_sizes = 0; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EMPTY_SAMPLE_SET); sample_set_sizes = 4; /* Window errors */ ret = method(ts, 1, &sample_set_sizes, samples, 0, windows, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS); ret = method(ts, 1, &sample_set_sizes, samples, 2, windows, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); } // Temporary definition for time_windows in tsk_treeseq_allele_frequency_spectrum typedef int one_way_sample_stat_method_tw(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_size_t num_time_windows, const double *time_windows, tsk_flags_t options, double *result); // Temporary duplicate for time-windows-having methods static void verify_one_way_stat_func_errors_tw( tsk_treeseq_t *ts, one_way_sample_stat_method_tw *method) { int ret; tsk_id_t num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(ts); tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes = 4; double windows[] = { 0, 0, 0 }; double time_windows[] = { -1, 0.5, INFINITY }; double result; ret = method(ts, 0, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); samples[0] = TSK_NULL; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = -10; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = num_nodes; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, 
NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = num_nodes + 1; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = num_nodes - 1; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLES); samples[0] = 1; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); samples[0] = 0; sample_set_sizes = 0; ret = method(ts, 1, &sample_set_sizes, samples, 0, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EMPTY_SAMPLE_SET); sample_set_sizes = 4; /* Window errors */ ret = method(ts, 1, &sample_set_sizes, samples, 0, windows, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS); ret = method(ts, 1, &sample_set_sizes, samples, 2, windows, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); /* Time window errors */ ret = method( ts, 1, &sample_set_sizes, samples, 0, NULL, 0, time_windows, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS_DIM); ret = method( ts, 1, &sample_set_sizes, samples, 0, NULL, 2, time_windows, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS); time_windows[0] = 0.1; ret = method( ts, 1, &sample_set_sizes, samples, 0, NULL, 2, time_windows, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS); time_windows[0] = 0; time_windows[1] = 0; ret = method( ts, 1, &sample_set_sizes, samples, 0, NULL, 2, time_windows, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TIME_WINDOWS); } static void verify_two_way_stat_func_errors( tsk_treeseq_t *ts, general_sample_stat_method *method, tsk_flags_t options) { int ret; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 2, 2 }; tsk_id_t set_indexes[] = { 0, 1 }; double result; ret = method(ts, 0, sample_set_sizes, samples, 1, 
set_indexes, 0, NULL, options | TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); ret = method(ts, 2, sample_set_sizes, samples, 0, set_indexes, 0, NULL, options | TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_INDEX_TUPLES); set_indexes[0] = -1; ret = method(ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, options | TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX); set_indexes[0] = 0; set_indexes[1] = 2; ret = method(ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, options | TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX); } static void verify_two_way_weighted_stat_func_errors( tsk_treeseq_t *ts, two_way_weighted_method *method, tsk_flags_t options) { int ret; tsk_id_t indexes[] = { 0, 0, 0, 1 }; double bad_windows[] = { -1, -1 }; double weights[10]; double result[10]; memset(weights, 0, sizeof(weights)); ret = method(ts, 2, weights, 2, indexes, 0, NULL, result, options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = method(ts, 2, weights, 2, indexes, 0, NULL, result, options | TSK_STAT_SITE | TSK_STAT_NODE); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES); ret = method(ts, 0, weights, 2, indexes, 0, NULL, result, options); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_WEIGHTS); ret = method(ts, 2, weights, 2, indexes, 1, bad_windows, result, options); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); } static void verify_three_way_stat_func_errors(tsk_treeseq_t *ts, general_sample_stat_method *method) { int ret; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 1, 1, 2 }; tsk_id_t set_indexes[] = { 0, 1, 2 }; double result; ret = method(ts, 0, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); ret = method(ts, 3, sample_set_sizes, samples, 0, set_indexes, 0, NULL, TSK_STAT_SITE, &result); 
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_INDEX_TUPLES); set_indexes[0] = -1; ret = method(ts, 3, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX); set_indexes[0] = 0; set_indexes[1] = 3; ret = method(ts, 3, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX); } static void verify_four_way_stat_func_errors(tsk_treeseq_t *ts, general_sample_stat_method *method) { int ret; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 1, 1, 1, 1 }; tsk_id_t set_indexes[] = { 0, 1, 2, 3 }; double result; ret = method(ts, 0, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); ret = method(ts, 4, sample_set_sizes, samples, 0, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_INDEX_TUPLES); set_indexes[0] = -1; ret = method(ts, 4, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX); set_indexes[0] = 0; set_indexes[1] = 4; ret = method(ts, 4, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX); } static int general_stat_identity( tsk_size_t K, const double *restrict X, tsk_size_t M, double *Y, void *params) { tsk_size_t k; CU_ASSERT_FATAL(M == K); CU_ASSERT_FATAL(params == NULL); for (k = 0; k < K; k++) { Y[k] = X[k]; } return 0; } static void verify_branch_general_stat_identity(tsk_treeseq_t *ts) { CU_ASSERT_FATAL(ts != NULL); int ret; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); double *W = tsk_malloc(num_samples * sizeof(double)); tsk_id_t *nodes = tsk_malloc(tsk_treeseq_get_num_nodes(ts) * sizeof(*nodes)); tsk_id_t u; tsk_size_t num_nodes; double s, branch_length; double *sigma = 
tsk_malloc(tsk_treeseq_get_num_trees(ts) * sizeof(*sigma)); tsk_tree_t tree; tsk_size_t j; CU_ASSERT_FATAL(W != NULL); CU_ASSERT_FATAL(nodes != NULL); for (j = 0; j < num_samples; j++) { W[j] = 1; } ret = tsk_treeseq_general_stat(ts, 1, W, 1, general_stat_identity, NULL, tsk_treeseq_get_num_trees(ts), tsk_treeseq_get_breakpoints(ts), TSK_STAT_BRANCH | TSK_STAT_POLARISED | TSK_STAT_SPAN_NORMALISE, sigma); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_init(&tree, ts, 0); CU_ASSERT_EQUAL(ret, 0); for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { ret = tsk_tree_preorder(&tree, nodes, &num_nodes); CU_ASSERT_EQUAL_FATAL(ret, 0); s = 0; for (j = 0; j < num_nodes; j++) { u = nodes[j]; ret = tsk_tree_get_branch_length(&tree, u, &branch_length); CU_ASSERT_EQUAL_FATAL(ret, 0); s += branch_length * (double) tree.num_samples[u]; } CU_ASSERT_DOUBLE_EQUAL_FATAL(sigma[tree.index], s, 1e-6); } CU_ASSERT_EQUAL_FATAL(ret, 0); free(nodes); tsk_tree_free(&tree); free(W); free(sigma); } static int general_stat_sum( tsk_size_t K, const double *restrict X, tsk_size_t M, double *Y, void *params) { tsk_size_t k, m; double s = 0; CU_ASSERT_FATAL(params == NULL); s = 0; for (k = 0; k < K; k++) { s += X[k]; } for (m = 0; m < M; m++) { Y[m] = s; } return 0; } static void verify_general_stat_dims( tsk_treeseq_t *ts, tsk_size_t K, tsk_size_t M, tsk_flags_t options) { int ret; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); double *W = tsk_malloc(K * num_samples * sizeof(double)); /* We need this much space for NODE mode; no harm for other modes. 
*/ double *sigma = tsk_calloc(tsk_treeseq_get_num_nodes(ts) * M, sizeof(double)); tsk_size_t j, k; CU_ASSERT_FATAL(W != NULL); for (j = 0; j < num_samples; j++) { for (k = 0; k < K; k++) { W[j * K + k] = 1; } } ret = tsk_treeseq_general_stat( ts, K, W, M, general_stat_sum, NULL, 0, NULL, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, 0); free(W); free(sigma); } static void verify_general_stat_windows( tsk_treeseq_t *ts, tsk_size_t num_windows, tsk_flags_t options) { int ret; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); double *W = tsk_malloc(num_samples * sizeof(double)); tsk_size_t M = 5; /* We need this much space for NODE mode; no harm for other modes. */ double *sigma = tsk_calloc(M * tsk_treeseq_get_num_nodes(ts) * num_windows, sizeof(double)); double *windows = tsk_malloc((num_windows + 1) * sizeof(*windows)); double L = tsk_treeseq_get_sequence_length(ts); tsk_size_t j; CU_ASSERT_FATAL(W != NULL); CU_ASSERT_FATAL(sigma != NULL); CU_ASSERT_FATAL(windows != NULL); for (j = 0; j < num_samples; j++) { W[j] = 1; } windows[0] = 0; windows[num_windows] = L; for (j = 1; j < num_windows; j++) { windows[j] = ((double) j) * L / (double) num_windows; } ret = tsk_treeseq_general_stat( ts, 1, W, M, general_stat_sum, NULL, num_windows, windows, options, sigma); CU_ASSERT_EQUAL_FATAL(ret, 0); free(W); free(sigma); free(windows); } static void verify_default_general_stat(tsk_treeseq_t *ts) { int ret; tsk_size_t K = 2; tsk_size_t M = 1; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); double *W = tsk_malloc(K * num_samples * sizeof(double)); double sigma1, sigma2; tsk_size_t j, k; CU_ASSERT_FATAL(W != NULL); for (j = 0; j < num_samples; j++) { for (k = 0; k < K; k++) { W[j * K + k] = 1; } } ret = tsk_treeseq_general_stat( ts, K, W, M, general_stat_sum, NULL, 0, NULL, TSK_STAT_SITE, &sigma1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_general_stat( ts, K, W, M, general_stat_sum, NULL, 0, NULL, 0, &sigma2); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL_FATAL(sigma1, sigma2); free(W); } static void verify_general_stat(tsk_treeseq_t *ts, tsk_flags_t mode) { CU_ASSERT_FATAL(ts != NULL); verify_general_stat_dims(ts, 4, 2, mode); verify_general_stat_dims(ts, 4, 2, mode | TSK_STAT_POLARISED); verify_general_stat_dims(ts, 1, 20, mode); verify_general_stat_dims(ts, 1, 20, mode | TSK_STAT_POLARISED); verify_general_stat_dims(ts, 100, 1, mode); verify_general_stat_dims(ts, 100, 1, mode | TSK_STAT_POLARISED); verify_general_stat_dims(ts, 10, 12, mode); verify_general_stat_dims(ts, 10, 12, mode | TSK_STAT_POLARISED); verify_general_stat_windows(ts, 1, mode); verify_general_stat_windows(ts, 1, mode | TSK_STAT_SPAN_NORMALISE); verify_general_stat_windows(ts, 2, mode); verify_general_stat_windows(ts, 2, mode | TSK_STAT_SPAN_NORMALISE); verify_general_stat_windows(ts, 3, mode); verify_general_stat_windows(ts, 3, mode | TSK_STAT_SPAN_NORMALISE); verify_general_stat_windows(ts, 10, mode); verify_general_stat_windows(ts, 10, mode | TSK_STAT_SPAN_NORMALISE); verify_general_stat_windows(ts, 100, mode); verify_general_stat_windows(ts, 100, mode | TSK_STAT_SPAN_NORMALISE); } static void verify_afs(tsk_treeseq_t *ts) { int ret; tsk_size_t n = tsk_treeseq_get_num_samples(ts); tsk_size_t sample_set_sizes[2]; double time_windows[] = { 0, 1 }; const tsk_id_t *samples = tsk_treeseq_get_samples(ts); double *result = tsk_malloc(n * n * sizeof(*result)); CU_ASSERT_FATAL(sample_set_sizes != NULL); sample_set_sizes[0] = n - 2; sample_set_sizes[1] = 2; ret = tsk_treeseq_allele_frequency_spectrum( ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_allele_frequency_spectrum( ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_POLARISED, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_allele_frequency_spectrum(ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_POLARISED | TSK_STAT_SPAN_NORMALISE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = 
tsk_treeseq_allele_frequency_spectrum(ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_POLARISED | TSK_STAT_SPAN_NORMALISE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_allele_frequency_spectrum(ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_allele_frequency_spectrum(ts, 2, sample_set_sizes, samples, 0, NULL, 1, time_windows, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); free(result); } static void test_general_stat_input_errors(void) { tsk_treeseq_t ts; double result; double W; int ret; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); /* Bad input dimensions */ ret = tsk_treeseq_general_stat( &ts, 0, &W, 1, general_stat_sum, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_STATE_DIMS); ret = tsk_treeseq_general_stat( &ts, 1, &W, 0, general_stat_sum, NULL, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_RESULT_DIMS); /* Multiple stats*/ ret = tsk_treeseq_general_stat(&ts, 1, &W, 1, general_stat_sum, NULL, 0, NULL, TSK_STAT_SITE | TSK_STAT_BRANCH, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES); ret = tsk_treeseq_general_stat(&ts, 1, &W, 1, general_stat_sum, NULL, 0, NULL, TSK_STAT_SITE | TSK_STAT_NODE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES); ret = tsk_treeseq_general_stat(&ts, 1, &W, 1, general_stat_sum, NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_NODE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES); tsk_treeseq_free(&ts); } static void test_empty_ts_ld(void) { tsk_treeseq_t ts; tsk_treeseq_from_text( &ts, 1, single_tree_ex_nodes, "", NULL, NULL, NULL, NULL, NULL, 0); verify_ld(&ts); tsk_treeseq_free(&ts); } static void test_empty_ts_mean_descendants(void) { tsk_treeseq_t ts; tsk_treeseq_from_text( 
&ts, 1, single_tree_ex_nodes, "", NULL, NULL, NULL, NULL, NULL, 0); verify_mean_descendants(&ts); tsk_treeseq_free(&ts); } static void test_empty_ts_genealogical_nearest_neighbours(void) { tsk_treeseq_t ts; tsk_treeseq_from_text( &ts, 1, single_tree_ex_nodes, "", NULL, NULL, NULL, NULL, NULL, 0); verify_genealogical_nearest_neighbours(&ts); tsk_treeseq_free(&ts); } static void test_empty_ts_general_stat(void) { tsk_treeseq_t ts; tsk_treeseq_from_text( &ts, 1, single_tree_ex_nodes, "", NULL, NULL, NULL, NULL, NULL, 0); verify_branch_general_stat_identity(&ts); verify_default_general_stat(&ts); verify_general_stat(&ts, TSK_STAT_BRANCH); verify_general_stat(&ts, TSK_STAT_SITE); verify_general_stat(&ts, TSK_STAT_NODE); tsk_treeseq_free(&ts); } static void test_empty_ts_afs(void) { tsk_treeseq_t ts; tsk_treeseq_from_text( &ts, 1, single_tree_ex_nodes, "", NULL, NULL, NULL, NULL, NULL, 0); verify_afs(&ts); tsk_treeseq_free(&ts); } static void test_single_tree_ld(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); verify_ld(&ts); tsk_treeseq_free(&ts); } static void test_single_tree_mean_descendants(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); verify_mean_descendants(&ts); tsk_treeseq_free(&ts); } static void test_single_tree_genealogical_nearest_neighbours(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); verify_genealogical_nearest_neighbours(&ts); tsk_treeseq_free(&ts); } static void test_single_tree_general_stat(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); 
verify_branch_general_stat_identity(&ts); verify_default_general_stat(&ts); verify_general_stat(&ts, TSK_STAT_BRANCH); verify_general_stat(&ts, TSK_STAT_SITE); verify_general_stat(&ts, TSK_STAT_NODE); tsk_treeseq_free(&ts); } static void test_single_tree_general_stat_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); verify_branch_general_stat_errors(&ts); verify_site_general_stat_errors(&ts); verify_node_general_stat_errors(&ts); tsk_treeseq_free(&ts); } static void test_single_tree_divergence_matrix(void) { tsk_treeseq_t ts; int ret; double result[16]; double D_branch[16] = { 0, 2, 6, 6, 2, 0, 6, 6, 6, 6, 0, 4, 6, 6, 4, 0 }; double D_site[16] = { 0, 1, 1, 0, 1, 0, 2, 1, 1, 2, 0, 1, 0, 1, 1, 0 }; tsk_size_t sample_set_sizes[] = { 2, 2 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(16, result, D_branch); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(16, result, D_site); ret = tsk_treeseq_divergence_matrix( &ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_divergence_matrix( &ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); sample_set_sizes[0] = 3; sample_set_sizes[1] = 1; ret = tsk_treeseq_divergence_matrix( &ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_divergence_matrix( &ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); /* assert_arrays_almost_equal(4, result, D_site); */ 
verify_divergence_matrix(&ts, TSK_STAT_BRANCH); verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE); verify_divergence_matrix(&ts, TSK_STAT_SITE); verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE); tsk_treeseq_free(&ts); } static void test_single_tree_divergence_matrix_internal_samples(void) { tsk_treeseq_t ts; int ret; double *result = malloc(16 * sizeof(double)); double D[16] = { 0, 2, 4, 3, 2, 0, 4, 3, 4, 4, 0, 1, 3, 3, 1, 0 }; const char *nodes = "1 0 -1 -1\n" /* 2.00┊ 6 ┊ */ "1 0 -1 -1\n" /* ┊ ┏━┻━┓ ┊ */ "1 0 -1 -1\n" /* 1.00┊ 4 5* ┊ */ "0 0 -1 -1\n" /* ┊ ┏┻┓ ┏┻┓ ┊ */ "0 1 -1 -1\n" /* 0.00┊ 0 1 2 3 ┊ */ "1 1 -1 -1\n" /* 0 * * * 1 */ "0 2 -1 -1\n"; const char *edges = "0 1 4 0,1\n" "0 1 5 2,3\n" "0 1 6 4,5\n"; /* One mutations per branch so we get the same as the branch length value */ const char *sites = "0.1 A\n" "0.2 A\n" "0.3 A\n" "0.4 A\n" "0.5 A\n" "0.6 A\n"; const char *mutations = "0 0 T -1\n" "1 1 T -1\n" "2 2 T -1\n" "3 3 T -1\n" "4 4 T -1\n" "5 5 T -1\n"; tsk_id_t samples[] = { 0, 1, 2, 5 }; tsk_size_t sizes[] = { 1, 1, 1, 1 }; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(16, result, D); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(16, result, D); ret = tsk_treeseq_divergence_matrix( &ts, 4, sizes, samples, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(16, result, D); ret = tsk_treeseq_divergence_matrix( &ts, 4, sizes, samples, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(16, result, D); ret = tsk_treeseq_divergence_matrix( &ts, 4, NULL, samples, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); 
assert_arrays_almost_equal(16, result, D); ret = tsk_treeseq_divergence_matrix( &ts, 4, NULL, samples, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(16, result, D); verify_divergence_matrix(&ts, TSK_STAT_BRANCH); verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE); verify_divergence_matrix(&ts, TSK_STAT_SITE); verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE); tsk_treeseq_free(&ts); free(result); } static void test_single_tree_divergence_matrix_multi_root(void) { tsk_treeseq_t ts; int ret; double result[16]; double D_branch[16] = { 0, 2, 3, 3, 2, 0, 3, 3, 3, 3, 0, 4, 3, 3, 4, 0 }; const char *nodes = "1 0 -1 -1\n" "1 0 -1 -1\n" /* 2.00┊ 5 ┊ */ "1 0 -1 -1\n" /* 1.00┊ 4 ┊ */ "1 0 -1 -1\n" /* ┊ ┏┻┓ ┏┻┓ ┊ */ "0 1 -1 -1\n" /* 0.00┊ 0 1 2 3 ┊ */ "0 2 -1 -1\n"; /* 0 * * * * 1 */ const char *edges = "0 1 4 0,1\n" "0 1 5 2,3\n"; /* Two mutations per branch */ const char *sites = "0.1 A\n" "0.2 A\n" "0.3 A\n" "0.4 A\n"; const char *mutations = "0 0 B -1\n" "0 0 C 0\n" "1 1 B -1\n" "1 1 C 2\n" "2 2 B -1\n" "2 2 C 4\n" "2 2 D 5\n" "2 2 E 6\n" "3 3 B -1\n" "3 3 C 8\n" "3 3 D 9\n" "3 3 E 10\n"; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(16, result, D_branch); verify_divergence_matrix(&ts, TSK_STAT_BRANCH); verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE); verify_divergence_matrix(&ts, TSK_STAT_SITE); verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE); tsk_treeseq_free(&ts); } static void test_paper_ex_ld(void) { tsk_treeseq_t ts; tsk_ld_calc_t ld_calc; double r2[3]; tsk_size_t num_r2_values; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_ld(&ts); /* Check 
early exit corner cases */ ret = tsk_ld_calc_init(&ld_calc, &ts); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_ld_calc_get_r2_array( &ld_calc, 0, TSK_DIR_FORWARD, 1, DBL_MAX, r2, &num_r2_values); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(num_r2_values, 1); ret = tsk_ld_calc_get_r2_array( &ld_calc, 2, TSK_DIR_REVERSE, 1, DBL_MAX, r2, &num_r2_values); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(num_r2_values, 1); tsk_ld_calc_free(&ld_calc); tsk_treeseq_free(&ts); } static void test_paper_ex_mean_descendants(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_mean_descendants(&ts); tsk_treeseq_free(&ts); } static void test_paper_ex_genealogical_nearest_neighbours(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_genealogical_nearest_neighbours(&ts); tsk_treeseq_free(&ts); } static void test_paper_ex_general_stat(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_branch_general_stat_identity(&ts); verify_default_general_stat(&ts); verify_general_stat(&ts, TSK_STAT_BRANCH); verify_general_stat(&ts, TSK_STAT_SITE); verify_general_stat(&ts, TSK_STAT_NODE); tsk_treeseq_free(&ts); } static void test_paper_ex_general_stat_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_branch_general_stat_errors(&ts); verify_site_general_stat_errors(&ts); verify_node_general_stat_errors(&ts); tsk_treeseq_free(&ts); } static void test_paper_ex_diversity_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, 
paper_ex_individuals, NULL, 0); verify_one_way_stat_func_errors(&ts, tsk_treeseq_diversity); tsk_treeseq_free(&ts); } static void test_paper_ex_diversity(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes = 4; double pi; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_diversity( &ts, 1, &sample_set_sizes, samples, 0, NULL, TSK_STAT_SITE, &pi); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(pi, 1.5, 1e-6); /* A sample set size of 1 leads to NaN */ sample_set_sizes = 1; ret = tsk_treeseq_diversity( &ts, 1, &sample_set_sizes, samples, 0, NULL, TSK_STAT_SITE, &pi); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT(tsk_isnan(pi)); tsk_treeseq_free(&ts); } static void test_paper_ex_trait_covariance_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_one_way_weighted_func_errors(&ts, tsk_treeseq_trait_covariance); tsk_treeseq_free(&ts); } static void test_paper_ex_trait_covariance(void) { tsk_treeseq_t ts; double result; double *weights; tsk_size_t j; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); weights = tsk_malloc(4 * sizeof(double)); weights[0] = weights[1] = 0.0; weights[2] = weights[3] = 1.0; ret = tsk_treeseq_trait_covariance(&ts, 1, weights, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(result, 1.0 / 12.0, 1e-6); /* weights of 0 leads to 0 */ for (j = 0; j < 4; j++) { weights[j] = 0.0; } ret = tsk_treeseq_trait_covariance(&ts, 1, weights, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(result, 0.0, 1e-6); tsk_treeseq_free(&ts); free(weights); } static void 
test_paper_ex_trait_correlation_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_one_way_weighted_func_errors(&ts, tsk_treeseq_trait_correlation); tsk_treeseq_free(&ts); } static void test_paper_ex_trait_correlation(void) { tsk_treeseq_t ts; double result; double *weights; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); weights = tsk_malloc(4 * sizeof(double)); weights[0] = weights[1] = 0.0; weights[2] = weights[3] = 1.0; ret = tsk_treeseq_trait_correlation( &ts, 1, weights, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(result, 1.0, 1e-6); tsk_treeseq_free(&ts); free(weights); } static void test_paper_ex_trait_linear_model_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_one_way_weighted_covariate_func_errors(&ts, tsk_treeseq_trait_linear_model); tsk_treeseq_free(&ts); } static void test_paper_ex_trait_linear_model(void) { tsk_treeseq_t ts; double result; double *weights; double *covariates; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); weights = tsk_malloc(4 * sizeof(double)); covariates = tsk_malloc(8 * sizeof(double)); weights[0] = weights[1] = 0.0; weights[2] = weights[3] = 1.0; covariates[0] = covariates[1] = 0.0; covariates[2] = covariates[3] = 1.0; covariates[4] = covariates[6] = 0.0; covariates[5] = covariates[7] = 1.0; ret = tsk_treeseq_trait_linear_model( &ts, 1, weights, 2, covariates, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(result, 0.0, 1e-6); tsk_treeseq_free(&ts); free(weights); free(covariates); } static 
void test_paper_ex_segregating_sites_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_one_way_stat_func_errors(&ts, tsk_treeseq_segregating_sites); tsk_treeseq_free(&ts); } static void test_paper_ex_segregating_sites(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes = 4; double segsites; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_segregating_sites( &ts, 1, &sample_set_sizes, samples, 0, NULL, TSK_STAT_SITE, &segsites); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(segsites, 3.0, 1e-6); /* A sample set size of 1 leads to 0 */ sample_set_sizes = 1; ret = tsk_treeseq_segregating_sites( &ts, 1, &sample_set_sizes, samples, 0, NULL, TSK_STAT_SITE, &segsites); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(segsites, 0.0, 1e-6); tsk_treeseq_free(&ts); } static void test_paper_ex_Y1_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_one_way_stat_func_errors(&ts, tsk_treeseq_Y1); tsk_treeseq_free(&ts); } static void test_paper_ex_Y1(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes = 4; double result; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_Y1(&ts, 1, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); /* A sample set size of < 2 leads to NaN */ sample_set_sizes = 1; ret = tsk_treeseq_Y1(&ts, 1, &sample_set_sizes, samples, 0, NULL, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT(tsk_isnan(result)); tsk_treeseq_free(&ts); } static void 
test_paper_ex_divergence_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_two_way_stat_func_errors(&ts, tsk_treeseq_divergence, 0); tsk_treeseq_free(&ts); } static void test_paper_ex_divergence(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 2, 2 }; tsk_id_t set_indexes[] = { 0, 1 }; double result; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_divergence(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); /* sample_set[0] size = 1 with indexes = (0, 0) leads to NaN */ sample_set_sizes[0] = 1; set_indexes[1] = 0; ret = tsk_treeseq_divergence(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT(tsk_isnan(result)); tsk_treeseq_free(&ts); } static void test_paper_ex_genetic_relatedness(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 2, 2 }; tsk_id_t set_indexes[] = { 0, 0 }; double result; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_genetic_relatedness(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE | TSK_STAT_NONCENTRED, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } static void test_paper_ex_genetic_relatedness_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); 
verify_two_way_stat_func_errors(&ts, tsk_treeseq_genetic_relatedness, 0); verify_two_way_stat_func_errors( &ts, tsk_treeseq_genetic_relatedness, TSK_STAT_NONCENTRED); verify_two_way_stat_func_errors( &ts, tsk_treeseq_genetic_relatedness, TSK_STAT_POLARISED); tsk_treeseq_free(&ts); } static void test_paper_ex_genetic_relatedness_weighted(void) { tsk_treeseq_t ts; double weights[] = { 1.2, 0.1, 0.0, 0.0, 3.4, 5.0, 1.0, -1.0 }; tsk_id_t indexes[] = { 0, 0, 0, 1 }; double result[100]; tsk_size_t num_weights; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); for (num_weights = 1; num_weights < 3; num_weights++) { ret = tsk_treeseq_genetic_relatedness_weighted( &ts, num_weights, weights, 2, indexes, 0, NULL, result, TSK_STAT_SITE); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_weighted( &ts, num_weights, weights, 2, indexes, 0, NULL, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_weighted( &ts, num_weights, weights, 2, indexes, 0, NULL, result, TSK_STAT_NODE); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2, indexes, 0, NULL, result, TSK_STAT_SITE | TSK_STAT_NONCENTRED); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2, indexes, 0, NULL, result, TSK_STAT_BRANCH | TSK_STAT_NONCENTRED); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_weighted(&ts, num_weights, weights, 2, indexes, 0, NULL, result, TSK_STAT_NODE | TSK_STAT_NONCENTRED); CU_ASSERT_EQUAL_FATAL(ret, 0); } tsk_treeseq_free(&ts); } static void test_paper_ex_genetic_relatedness_weighted_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_two_way_weighted_stat_func_errors( &ts, 
tsk_treeseq_genetic_relatedness_weighted, 0); verify_two_way_weighted_stat_func_errors( &ts, tsk_treeseq_genetic_relatedness_weighted, TSK_STAT_NONCENTRED); verify_two_way_weighted_stat_func_errors( &ts, tsk_treeseq_genetic_relatedness_weighted, TSK_STAT_POLARISED); tsk_treeseq_free(&ts); } static void test_empty_genetic_relatedness_vector(void) { int ret; tsk_treeseq_t ts; tsk_size_t num_samples; double *weights, *result, *result2; tsk_size_t j; tsk_size_t num_weights = 2; double windows[] = { 0, 0 }; tsk_treeseq_from_text( &ts, 1, single_tree_ex_nodes, "", NULL, NULL, NULL, NULL, NULL, 0); num_samples = tsk_treeseq_get_num_samples(&ts); windows[1] = tsk_treeseq_get_sequence_length(&ts); weights = tsk_malloc(num_weights * num_samples * sizeof(double)); result = tsk_malloc(num_weights * num_samples * sizeof(double)); result2 = tsk_malloc(num_weights * num_samples * sizeof(double)); for (j = 0; j < num_samples; j++) { weights[j] = 1.0; } for (j = 0; j < num_samples; j++) { weights[j + num_samples] = (float) j; } ret = tsk_treeseq_genetic_relatedness_vector( &ts, num_weights, weights, 1, windows, num_samples, ts.samples, result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_vector(&ts, num_weights, weights, 1, windows, num_samples, ts.samples, result, TSK_STAT_NONCENTRED); CU_ASSERT_EQUAL_FATAL(ret, 0); windows[0] = 0.5 * tsk_treeseq_get_sequence_length(&ts); windows[1] = 0.75 * tsk_treeseq_get_sequence_length(&ts); ret = tsk_treeseq_genetic_relatedness_vector( &ts, num_weights, weights, 1, windows, num_samples, ts.samples, result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_vector(&ts, num_weights, weights, 1, windows, num_samples, ts.samples, result2, TSK_STAT_SPAN_NORMALISE); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_samples * num_weights; j++) { CU_ASSERT_EQUAL_FATAL(result[j] / (windows[1] - windows[0]), result2[j]); } tsk_treeseq_free(&ts); free(weights); free(result); free(result2); } static void 
verify_genetic_relatedness_vector( tsk_treeseq_t *ts, tsk_size_t num_weights, tsk_size_t num_windows) { int ret; tsk_size_t num_samples; double *weights, *result; tsk_size_t j, k; double *windows = tsk_malloc((num_windows + 1) * sizeof(*windows)); double L = tsk_treeseq_get_sequence_length(ts); windows[0] = 0; windows[num_windows] = L; for (j = 1; j < num_windows; j++) { windows[j] = ((double) j) * L / (double) num_windows; } num_samples = tsk_treeseq_get_num_samples(ts); weights = tsk_malloc(num_weights * num_samples * sizeof(*weights)); result = tsk_malloc(num_windows * num_weights * num_samples * sizeof(*result)); for (j = 0; j < num_samples; j++) { for (k = 0; k < num_weights; k++) { weights[j + k * num_samples] = 1.0 + (double) k; } } ret = tsk_treeseq_genetic_relatedness_vector(ts, num_weights, weights, num_windows, windows, num_samples, ts->samples, result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); windows[0] = windows[1] / 2; if (num_windows > 1) { windows[num_windows - 1] = windows[num_windows - 2] + (L / (double) (2 * num_windows)); } ret = tsk_treeseq_genetic_relatedness_vector(ts, num_weights, weights, num_windows, windows, num_samples, ts->samples, result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_vector(ts, num_weights, weights, num_windows, windows, num_samples, ts->samples, result, TSK_STAT_NONCENTRED); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_set_debug_stream(_devnull); ret = tsk_treeseq_genetic_relatedness_vector(ts, num_weights, weights, num_windows, windows, num_samples, ts->samples, result, TSK_DEBUG); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_set_debug_stream(stdout); free(windows); free(weights); free(result); } static void test_paper_ex_genetic_relatedness_vector(void) { tsk_treeseq_t ts; double gap; for (gap = 0.0; gap < 2.0; gap += 1.0) { tsk_treeseq_from_text(&ts, 10 + gap, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); tsk_size_t j, k; for (j = 1; j < 3; j++) { for (k 
= 1; k < 3; k++) { verify_genetic_relatedness_vector(&ts, j, k); } } tsk_treeseq_free(&ts); } } static void test_paper_ex_genetic_relatedness_vector_errors(void) { int ret; tsk_treeseq_t ts; tsk_size_t num_samples; double *weights, *result; tsk_size_t j; tsk_size_t num_windows = 2; tsk_size_t num_weights = 2; double windows[] = { 0, 0, 0 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); num_samples = tsk_treeseq_get_num_samples(&ts); weights = tsk_malloc(num_weights * num_samples * sizeof(double)); result = tsk_malloc(num_windows * num_weights * num_samples * sizeof(double)); for (j = 0; j < num_samples; j++) { weights[j] = 1.0; } for (j = 0; j < num_samples; j++) { weights[j + num_samples] = (float) j; } /* Window errors */ ret = tsk_treeseq_genetic_relatedness_vector( &ts, 1, weights, 0, windows, num_samples, ts.samples, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS); ret = tsk_treeseq_genetic_relatedness_vector( &ts, 1, weights, 0, NULL, num_samples, ts.samples, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS); ret = tsk_treeseq_genetic_relatedness_vector( &ts, 1, weights, 2, windows, num_samples, ts.samples, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = -1; ret = tsk_treeseq_genetic_relatedness_vector( &ts, 1, weights, 2, windows, num_samples, ts.samples, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = 12; ret = tsk_treeseq_genetic_relatedness_vector( &ts, 1, weights, 2, windows, num_samples, ts.samples, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = 0; windows[2] = 12; ret = tsk_treeseq_genetic_relatedness_vector( &ts, 1, weights, 2, windows, num_samples, ts.samples, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); /* unsupported mode errors */ windows[0] = 0.0; 
windows[1] = 5.0; windows[2] = 10.0; ret = tsk_treeseq_genetic_relatedness_vector(&ts, num_weights, weights, 2, windows, num_samples, ts.samples, result, TSK_STAT_SITE); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE); ret = tsk_treeseq_genetic_relatedness_vector(&ts, num_weights, weights, 2, windows, num_samples, ts.samples, result, TSK_STAT_NODE); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE); tsk_treeseq_free(&ts); free(weights); free(result); } static void test_paper_ex_genetic_relatedness_vector_node_errors(void) { int ret; tsk_treeseq_t ts; tsk_size_t num_samples; double *weights, *result; tsk_size_t j; tsk_size_t num_weights = 2; tsk_size_t num_windows = 2; double windows[] = { 1, 1.5, 2 }; tsk_size_t num_nodes = 3; const tsk_id_t good_nodes[] = { 1, 0, 2 }; const tsk_id_t bad_nodes1[] = { 1, -1, 2 }; const tsk_id_t bad_nodes2[] = { 1, 100, 2 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); num_samples = tsk_treeseq_get_num_samples(&ts); weights = tsk_malloc(num_weights * num_samples * sizeof(double)); result = tsk_malloc(num_windows * num_weights * num_nodes * sizeof(double)); for (j = 0; j < num_samples; j++) { weights[j] = 1.0; } for (j = 0; j < num_samples; j++) { weights[j + num_samples] = (float) j; } /* node errors */ ret = tsk_treeseq_genetic_relatedness_vector(&ts, num_weights, weights, 2, windows, num_nodes, good_nodes, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_genetic_relatedness_vector(&ts, num_weights, weights, 2, windows, num_nodes, bad_nodes1, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_treeseq_genetic_relatedness_vector(&ts, num_weights, weights, 2, windows, num_nodes, bad_nodes2, result, TSK_STAT_BRANCH); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); free(weights); free(result); } static void 
test_paper_ex_Y2_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_two_way_stat_func_errors(&ts, tsk_treeseq_Y2, 0); tsk_treeseq_free(&ts); } static void test_paper_ex_Y2(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 2, 2 }; tsk_id_t set_indexes[] = { 0, 1 }; double result; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_Y2(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); /* sample_set_size of 1 leads to NaN */ sample_set_sizes[1] = 1; ret = tsk_treeseq_Y2(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT(tsk_isnan(result)); tsk_treeseq_free(&ts); } static void test_paper_ex_f2_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_two_way_stat_func_errors(&ts, tsk_treeseq_f2, 0); tsk_treeseq_free(&ts); } static void test_paper_ex_f2(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 2, 2 }; tsk_id_t set_indexes[] = { 0, 1 }; double result; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_f2(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); /* sample_set_size of 1 leads to NaN */ sample_set_sizes[0] = 1; ret = tsk_treeseq_f2(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT(tsk_isnan(result)); /* sample_set_size of 1 
leads to NaN */ sample_set_sizes[0] = 2; sample_set_sizes[1] = 1; ret = tsk_treeseq_f2(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT(tsk_isnan(result)); tsk_treeseq_free(&ts); } static void test_paper_ex_Y3_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_three_way_stat_func_errors(&ts, tsk_treeseq_Y3); tsk_treeseq_free(&ts); } static void test_paper_ex_Y3(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 2, 1, 1 }; tsk_id_t set_indexes[] = { 0, 1, 2 }; double result; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_Y3(&ts, 3, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } static void test_paper_ex_f3_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_three_way_stat_func_errors(&ts, tsk_treeseq_f3); tsk_treeseq_free(&ts); } static void test_paper_ex_f3(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 2, 1, 1 }; tsk_id_t set_indexes[] = { 0, 1, 2 }; double result; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_f3(&ts, 3, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); /* sample_set_size of 1 leads to NaN */ sample_set_sizes[0] = 1; ret = tsk_treeseq_f3(&ts, 3, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT(tsk_isnan(result)); tsk_treeseq_free(&ts); } static void test_paper_ex_f4_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_four_way_stat_func_errors(&ts, tsk_treeseq_f4); tsk_treeseq_free(&ts); } static void test_paper_ex_f4(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t sample_set_sizes[] = { 1, 1, 1, 1 }; tsk_id_t set_indexes[] = { 0, 1, 2, 3 }; double result; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_f4(&ts, 4, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); } static void test_paper_ex_afs_errors(void) { tsk_treeseq_t ts; tsk_size_t sample_set_sizes[] = { 2, 2 }; tsk_id_t samples[] = { 0, 1, 2, 3 }; double result[10]; /* not thinking too hard about the actual value needed */ double time_windows[] = { 0, 1 }; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_one_way_stat_func_errors_tw(&ts, tsk_treeseq_allele_frequency_spectrum); ret = tsk_treeseq_allele_frequency_spectrum( &ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_NODE, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE); ret = tsk_treeseq_allele_frequency_spectrum(&ts, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES); ret = tsk_treeseq_allele_frequency_spectrum(&ts, 2, sample_set_sizes, samples, 0, NULL, 1, time_windows, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE); tsk_treeseq_free(&ts); } static void test_paper_ex_afs(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3 }; 
tsk_size_t sample_set_sizes[] = { 4, 0 }; double result[25]; int ret; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); /* we have two singletons and one tripleton */ ret = tsk_treeseq_allele_frequency_spectrum( &ts, 1, sample_set_sizes, samples, 0, NULL, 0, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(result[0], 0); CU_ASSERT_EQUAL_FATAL(result[1], 3.0); CU_ASSERT_EQUAL_FATAL(result[2], 0); ret = tsk_treeseq_allele_frequency_spectrum( &ts, 1, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_POLARISED, result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(result[0], 0); CU_ASSERT_EQUAL_FATAL(result[1], 2.0); CU_ASSERT_EQUAL_FATAL(result[2], 0); CU_ASSERT_EQUAL_FATAL(result[3], 1.0); CU_ASSERT_EQUAL_FATAL(result[4], 0); verify_afs(&ts); tsk_treeseq_free(&ts); } static void test_paper_ex_divergence_matrix(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_divergence_matrix(&ts, TSK_STAT_BRANCH); verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE); verify_divergence_matrix(&ts, TSK_STAT_SITE); verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE); tsk_treeseq_free(&ts); } static void test_unary_ex_afs(void) { tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 2, 3 }; tsk_size_t sample_set_sizes[] = { 3, 0 }; double result[25]; int ret; tsk_treeseq_from_text(&ts, 100, unary_ex_nodes, unary_ex_edges, NULL, unary_ex_sites, unary_ex_mutations, NULL, NULL, 0); /* we have a singleton and a doubleton */ ret = tsk_treeseq_allele_frequency_spectrum( &ts, 1, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_POLARISED, result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(result[0], 0); CU_ASSERT_EQUAL_FATAL(result[1], 1.0); CU_ASSERT_EQUAL_FATAL(result[2], 1.0); CU_ASSERT_EQUAL_FATAL(result[3], 
0.0); ret = tsk_treeseq_allele_frequency_spectrum(&ts, 1, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_POLARISED, result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE_FATAL(result[0] > 0); CU_ASSERT_TRUE_FATAL(result[1] > 0); CU_ASSERT_TRUE_FATAL(result[2] > 0); CU_ASSERT_EQUAL_FATAL(result[3], 0.0); verify_afs(&ts); tsk_treeseq_free(&ts); } static void test_nonbinary_ex_ld(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_ld(&ts); tsk_treeseq_free(&ts); } static void test_nonbinary_ex_mean_descendants(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_mean_descendants(&ts); tsk_treeseq_free(&ts); } static void test_nonbinary_ex_genealogical_nearest_neighbours(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_genealogical_nearest_neighbours(&ts); tsk_treeseq_free(&ts); } static void test_nonbinary_ex_general_stat(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_branch_general_stat_identity(&ts); verify_default_general_stat(&ts); verify_general_stat(&ts, TSK_STAT_BRANCH); verify_general_stat(&ts, TSK_STAT_SITE); verify_general_stat(&ts, TSK_STAT_NODE); tsk_treeseq_free(&ts); } static void test_nonbinary_ex_general_stat_errors(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_branch_general_stat_errors(&ts); verify_site_general_stat_errors(&ts); verify_node_general_stat_errors(&ts); tsk_treeseq_free(&ts); } static void 
test_caterpillar_tree_ld(void) { tsk_treeseq_t *ts = caterpillar_tree(50, 20, 1); tsk_ld_calc_t ld_calc; double r2[20]; tsk_size_t num_r2_values; int ret = tsk_ld_calc_init(&ld_calc, ts); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_ld(ts); ret = tsk_ld_calc_get_r2_array( &ld_calc, 0, TSK_DIR_FORWARD, 5, DBL_MAX, r2, &num_r2_values); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(num_r2_values, 5); ret = tsk_ld_calc_get_r2_array( &ld_calc, 10, TSK_DIR_REVERSE, 5, DBL_MAX, r2, &num_r2_values); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(num_r2_values, 5); tsk_ld_calc_free(&ld_calc); tsk_treeseq_free(ts); free(ts); } static void test_ld_multi_mutations(void) { tsk_treeseq_t *ts = caterpillar_tree(4, 2, 2); tsk_ld_calc_t ld_calc; double r2; int ret = tsk_ld_calc_init(&ld_calc, ts); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_ld_calc_get_r2(&ld_calc, 0, 1, &r2); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_ONLY_INFINITE_SITES); tsk_ld_calc_free(&ld_calc); tsk_treeseq_free(ts); free(ts); } static void test_ld_silent_mutations(void) { tsk_treeseq_t *base_ts = caterpillar_tree(4, 2, 1); tsk_table_collection_t tables; tsk_treeseq_t ts; tsk_ld_calc_t ld_calc; double r2; int ret = tsk_table_collection_copy(base_ts->tables, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.mutations.derived_state[1] = '0'; ret = tsk_treeseq_init(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_ld_calc_init(&ld_calc, &ts); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_ld_calc_get_r2(&ld_calc, 0, 1, &r2); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED); tsk_ld_calc_free(&ld_calc); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); tsk_treeseq_free(base_ts); free(base_ts); } static void test_paper_ex_two_site(void) { tsk_treeseq_t ts; double result[27]; tsk_size_t s, result_size, num_sample_sets; int ret; double truth_one_set[9] = { 1, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 1, 1, 0.1111111111111111, 1, 1 }; double truth_two_sets[18] = { 1, 1, 
0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 1, 1, 1, 1, 0.1111111111111111, 0.1111111111111111, 1, 1, 1, 1 }; double truth_three_sets[27] = { 1, 1, NAN, 0.1111111111111111, 0.1111111111111111, NAN, 0.1111111111111111, 0.1111111111111111, NAN, 0.1111111111111111, 0.1111111111111111, NAN, 1, 1, 1, 1, 1, 1, 0.1111111111111111, 0.1111111111111111, NAN, 1, 1, 1, 1, 1, 1 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); double truth_three_index_tuples[27] = { 1, 1, NAN, 0.1111111111111111, 0.1111111111111111, NAN, 0.1111111111111111, 0.1111111111111111, NAN, 0.1111111111111111, 0.1111111111111111, NAN, 1, 1, 1, 1, 1, 1, 0.1111111111111111, 0.1111111111111111, NAN, 1, 1, 1, 1, 1, 1 }; tsk_size_t sample_set_sizes[3], num_index_tuples; tsk_id_t sample_sets[ts.num_samples * 3], index_tuples[2 * 3] = { 0, 1, 0, 0, 0, 2 }; tsk_size_t num_sites = ts.tables->sites.num_rows; tsk_id_t *sites = tsk_malloc(num_sites * sizeof(*sites)); // First sample set contains all of the samples sample_set_sizes[0] = ts.num_samples; num_sample_sets = 1; for (s = 0; s < ts.num_samples; s++) { sample_sets[s] = (tsk_id_t) s; } for (s = 0; s < num_sites; s++) { sites[s] = (tsk_id_t) s; } result_size = num_sites * num_sites; tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_sample_sets, result, truth_one_set); // Second sample set contains all of the samples sample_set_sizes[1] = ts.num_samples; num_sample_sets = 2; for (s = ts.num_samples; s < ts.num_samples * 2; s++) { sample_sets[s] = (tsk_id_t) s - (tsk_id_t) ts.num_samples; } tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = 
tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_sample_sets, result, truth_two_sets); // Third sample set contains the first two samples sample_set_sizes[2] = 2; num_sample_sets = 3; for (s = ts.num_samples * 2; s < (ts.num_samples * 3) - 2; s++) { sample_sets[s] = (tsk_id_t) s - (tsk_id_t) ts.num_samples * 2; } tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal_nan( result_size * num_sample_sets, result, truth_three_sets); // Two-way stats: we'll reuse all sample sets from the first 3 tests num_sample_sets = 3; num_index_tuples = 1; // We'll compute r2 between sample set 0 and sample set 1 tsk_memset(result, 0, sizeof(*result) * result_size * num_index_tuples); ret = tsk_treeseq_r2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_index_tuples, result, truth_one_set); // Compare sample sets [(0, 1), (0, 0)] num_index_tuples = 2; tsk_memset(result, 0, sizeof(*result) * result_size * num_index_tuples); ret = tsk_treeseq_r2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_index_tuples, result, truth_two_sets); // Compare sample sets [(0, 1), (0, 0), (0, 2)] num_index_tuples = 3; tsk_memset(result, 0, sizeof(*result) * result_size * num_index_tuples); ret = tsk_treeseq_r2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, 
num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal_nan( result_size * num_index_tuples, result, truth_three_index_tuples); tsk_treeseq_free(&ts); tsk_safe_free(sites); } static void test_paper_ex_two_branch(void) { int ret; tsk_treeseq_t ts; double result[27]; tsk_size_t i, result_size, num_sample_sets; tsk_flags_t options = 0; double truth_one_set[9] = { 0.008890640625, 0.004624203125, 0.005215703125, 0.004624203125, 0.003737578125, 0.004377078125, 0.005215703125, 0.004377078124999999, 0.005160578124999998 }; double truth_two_sets[18] = { 0.008890640625, 0.008890640625, 0.004624203125, 0.004624203125, 0.005215703125, 0.005215703125, 0.004624203125, 0.004624203125, 0.003737578125, 0.003737578125, 0.004377078125, 0.004377078125, 0.005215703125, 0.005215703125, 0.004377078124999999, 0.004377078124999999, 0.005160578124999998, 0.005160578124999998 }; double truth_three_sets[27] = { 0.008890640625, 0.008890640625, 0.007225, 0.004624203125000001, 0.004624203125, 0.007225, 0.005215703125000002, 0.005215703125, 0.008585, 0.004624203125, 0.004624203125, 0.007225, 0.003737578125, 0.003737578125, 0.007225, 0.004377078125, 0.004377078125, 0.008585, 0.005215703125, 0.005215703125, 0.008585, 0.004377078124999999, 0.004377078124999999, 0.008585, 0.005160578124999998, 0.005160578124999998, 0.010201 }; double truth_positions_subset_1[12] = { 0.008890640625, 0.008890640625, 0.007225, 0.008890640625, 0.008890640625, 0.007225, 0.008890640625, 0.008890640625, 0.007225, 0.008890640625, 0.008890640625, 0.007225 }; double truth_positions_subset_2[12] = { 0.003737578125, 0.003737578125, 0.007225, 0.003737578125, 0.003737578125, 0.007225, 0.003737578125, 0.003737578125, 0.007225, 0.003737578125, 0.003737578125, 0.007225 }; double truth_positions_subset_3[12] = { 0.005160578125, 0.005160578125, 0.010201, 0.005160578125, 0.005160578125, 0.010201, 0.005160578125, 0.005160578125, 0.010201, 0.005160578125, 0.005160578125, 
0.010201 }; double truth_three_index_tuples[27] = { 0.008890640625, 0.008890640625, 0.0039125, 0.004624203125, 0.004624203125, 0.0038125, 0.005215703125, 0.005215703125, 0.0045725, 0.004624203125, 0.004624203125, 0.0038125, 0.003737578125, 0.003737578125, 0.0040125, 0.004377078125, 0.004377078125, 0.0048525, 0.005215703125, 0.005215703125, 0.0045725, 0.004377078125, 0.004377078125, 0.0048525, 0.005160578125, 0.005160578125, 0.0058845 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); tsk_size_t sample_set_sizes[3], num_index_tuples; tsk_id_t sample_sets[ts.num_samples * 3], index_tuples[2 * 3] = { 0, 1, 0, 0, 0, 2 }; tsk_size_t num_trees = ts.num_trees; double *positions = tsk_malloc(num_trees * sizeof(*positions)); double positions_subset_1[2] = { 0., 0.1 }; double positions_subset_2[2] = { 2., 6. }; double positions_subset_3[2] = { 9., 9.999 }; // First sample set contains all of the samples sample_set_sizes[0] = ts.num_samples; num_sample_sets = 1; for (i = 0; i < ts.num_samples; i++) { sample_sets[i] = (tsk_id_t) i; } for (i = 0; i < num_trees; i++) { positions[i] = ts.breakpoints[i]; } options |= TSK_STAT_BRANCH; result_size = num_trees * num_trees * num_sample_sets; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_trees, NULL, positions, num_trees, NULL, positions, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_one_set); // Second sample set contains all of the samples sample_set_sizes[1] = ts.num_samples; num_sample_sets = 2; for (i = ts.num_samples; i < ts.num_samples * 2; i++) { sample_sets[i] = (tsk_id_t) i - (tsk_id_t) ts.num_samples; } result_size = num_trees * num_trees * num_sample_sets; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_trees, 
NULL, positions, num_trees, NULL, positions, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_two_sets); // Third sample set contains the first two samples sample_set_sizes[2] = 2; num_sample_sets = 3; for (i = ts.num_samples * 2; i < (ts.num_samples * 3) - 2; i++) { sample_sets[i] = (tsk_id_t) i - (tsk_id_t) ts.num_samples * 2; } result_size = num_trees * num_trees * num_sample_sets; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_trees, NULL, positions, num_trees, NULL, positions, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal_nan(result_size, result, truth_three_sets); result_size = 4 * num_sample_sets; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 2, NULL, positions_subset_1, 2, NULL, positions_subset_1, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal_nan(result_size, result, truth_positions_subset_1); result_size = 4 * num_sample_sets; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 2, NULL, positions_subset_2, 2, NULL, positions_subset_2, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal_nan(result_size, result, truth_positions_subset_2); result_size = 4 * num_sample_sets; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 2, NULL, positions_subset_3, 2, NULL, positions_subset_3, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal_nan(result_size, result, truth_positions_subset_3); // Two-way stats: we'll reuse all sample sets from the first 3 tests num_sample_sets = 3; result_size = num_trees * num_trees; num_index_tuples = 1; // We'll compute D2 between sample set 0 and sample set 1 
tsk_memset(result, 0, sizeof(*result) * result_size * num_index_tuples); ret = tsk_treeseq_D2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_trees, NULL, positions, num_trees, NULL, positions, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_index_tuples, result, truth_one_set); // Compare sample sets [(0, 1), (0, 0)] num_index_tuples = 2; tsk_memset(result, 0, sizeof(*result) * result_size * num_index_tuples); ret = tsk_treeseq_D2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_trees, NULL, positions, num_trees, NULL, positions, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_index_tuples, result, truth_two_sets); // Compare sample sets [(0, 1), (0, 0), (0, 2)] num_index_tuples = 3; tsk_memset(result, 0, sizeof(*result) * result_size * num_index_tuples); ret = tsk_treeseq_D2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_trees, NULL, positions, num_trees, NULL, positions, options, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal_nan( result_size * num_index_tuples, result, truth_three_index_tuples); tsk_treeseq_free(&ts); tsk_safe_free(positions); } static void test_two_site_correlated_multiallelic(void) { const char *nodes = "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "0 2 -1\n" "0 4 -1\n" "0 6 -1\n" "0 8 -1\n" "0 10 -1\n" "0 12 -1\n" "0 14 -1\n" "0 16 -1\n"; const char *edges = "0 20 9 0,1\n" "0 20 10 2,9\n" "0 20 11 4,5\n" "0 20 12 6,11\n" "0 20 13 7,8\n" "0 20 14 3,10\n" "0 10 15 12\n" "10 20 15 13\n" "0 10 15 14\n" "10 20 15 14\n" "10 20 16 12\n" "0 10 16 13\n" "0 10 16 15\n" "10 20 16 15\n"; const char *tree_sites = "7 A\n" "13 G\n"; const char *mutations = "0 15 T -1\n" "0 14 G 0\n" "1 15 T -1\n" "1 13 C 2\n"; int ret; tsk_treeseq_t ts; tsk_size_t s, result_size; 
double truth_D[4] = { 0.043209876543209874, -0.018518518518518517, -0.018518518518518517, 0.05555555555555555 }; double truth_D2[4] = { 0.023844603634269844, 0.02384460363426984, 0.02384460363426984, 0.02384460363426984 }; double truth_r2[4] = { 1, 1, 1, 1 }; double truth_D_prime[4] = { 0, -0.5, -0.5, 0 }; double truth_r[4] = { 0.18377223398316206, -0.12212786219416509, -0.12212786219416509, 0.2609542781331212 }; double truth_Dz[4] = { 0.0033870175616860566, 0.003387017561686057, 0.003387017561686057, 0.003387017561686057 }; double truth_pi2[4] = { 0.04579247743399549, 0.04579247743399549, 0.04579247743399549, 0.0457924774339955 }; double truth_D2_unbiased[4] = { 0.026455026455026454, 0.026455026455026454, 0.026455026455026454, 0.026455026455026454 }; double truth_Dz_unbiased[4] = { -0.008818342151675485, -0.008818342151675485, -0.008818342151675485, -0.008818342151675485 }; double truth_pi2_unbiased[4] = { 0.0582010582010582, 0.0582010582010582, 0.0582010582010582, 0.0582010582010582 }; double truth_D2_unbiased_disjoint[4] = { 0.007407407407407407, 0.007407407407407407, 0.007407407407407407, 0.007407407407407407 }; tsk_treeseq_from_text( &ts, 20, nodes, edges, NULL, tree_sites, mutations, NULL, NULL, 0); tsk_size_t num_sample_sets = 1; tsk_size_t sample_set_sizes[2] = { ts.num_samples, ts.num_samples }; tsk_id_t sample_sets[ts.num_samples * 2]; tsk_size_t num_sites = ts.tables->sites.num_rows; tsk_id_t *sites = tsk_malloc(num_sites * sizeof(*sites)); result_size = num_sites * num_sites; double result[result_size]; // Two sample sets for multipop at the bottom, only presenting one to single pop // results for (s = 0; s < ts.num_samples; s++) { sample_sets[s] = (tsk_id_t) s; sample_sets[s + ts.num_samples] = (tsk_id_t) s; } for (s = 0; s < num_sites; s++) { sites[s] = (tsk_id_t) s; } tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_D(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, 
sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_r2); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_D_prime(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D_prime); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_r(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_r); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_Dz(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_Dz); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_pi2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_pi2); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = 
tsk_treeseq_D2_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2_unbiased); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_Dz_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_Dz_unbiased); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_pi2_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_pi2_unbiased); // We'll compute r2 between sample set 0 and sample set 1 num_sample_sets = 2; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_r2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, (tsk_id_t[2]) { 0, 0 }, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_r2); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, (tsk_id_t[2]) { 0, 0 }, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2); // perfectly overlapping sample sets will produce a result equal to the single // population case tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2_ij_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, (tsk_id_t[2]) { 0, 0 }, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2_unbiased); // two 
disjoint sample sets with 5 and 4 samples {0,1,2,3,4}{5,6,7,8} sample_set_sizes[0] = 5; sample_set_sizes[1] = 4; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2_ij_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, (tsk_id_t[2]) { 0, 1 }, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2_unbiased_disjoint); tsk_treeseq_free(&ts); tsk_safe_free(sites); } static void test_two_site_uncorrelated_multiallelic(void) { const char *nodes = "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "0 2 -1\n" "0 4 -1\n" "0 6 -1\n" "0 8 -1\n" "0 10 -1\n" "0 12 -1\n" "0 14 -1\n" "0 16 -1\n" "0 2 -1\n" "0 4 -1\n" "0 6 -1\n" "0 8 -1\n" "0 10 -1\n" "0 12 -1\n" "0 14 -1\n" "0 16 -1\n"; const char *edges = "0 10 9 0,1\n" "10 20 17 0,3\n" "0 10 10 2,9\n" "10 20 18 6,17\n" "0 10 11 3,4\n" "10 20 19 1,4\n" "0 10 12 5,11\n" "10 20 20 7,19\n" "0 10 13 6,7\n" "10 20 21 2,5\n" "0 10 14 8,13\n" "10 20 22 8,21\n" "0 10 15 10,12\n" "10 20 23 18,20\n" "0 10 16 14,15\n" "10 20 24 22,23\n"; const char *tree_sites = "7 A\n" "13 G\n"; const char *mutations = "0 15 T -1\n" "0 12 G 0\n" "1 23 T -1\n" "1 20 A 2\n"; tsk_treeseq_t ts; int ret; double truth_D[4] = { 0.05555555555555555, 0.0, 0.0, 0.05555555555555555 }; double truth_D2[4] = { 0.024691358024691357, 0.0, 0.0, 0.024691358024691357 }; double truth_r2[4] = { 1, 0, 0, 1 }; double truth_D_prime[4] = { 0.0, 0.0, 0.0, 0.0 }; double truth_r[4] = { 0.25, 0.0, 0.0, 0.25 }; double truth_Dz[4] = { 0.0, 0.0, 0.0, 0.0 }; double truth_pi2[4] = { 0.04938271604938272, 0.04938271604938272, 0.04938271604938272, 0.04938271604938272 }; double truth_D2_unbiased[4] = { 0.027777777777777776, -0.009259259259259259, -0.009259259259259259, 0.027777777777777776 }; double truth_Dz_unbiased[4] = { -0.015873015873015872, 0.005291005291005289, 0.005291005291005289, -0.015873015873015872 }; 
double truth_pi2_unbiased[4] = { 0.06349206349206349, 0.06216931216931215, 0.06216931216931215, 0.06349206349206349 }; double truth_D2_unbiased_disjoint[4] = { 0.008333333333333333, -0.0027777777777777775, -0.0027777777777777775, 0.03518518518518518 }; tsk_treeseq_from_text( &ts, 20, nodes, edges, NULL, tree_sites, mutations, NULL, NULL, 0); tsk_size_t s; tsk_size_t num_sample_sets = 1; tsk_size_t num_sites = ts.tables->sites.num_rows; tsk_id_t *sites = tsk_malloc(num_sites * sizeof(*sites)); tsk_size_t sample_set_sizes[2] = { ts.num_samples, ts.num_samples }; tsk_id_t sample_sets[ts.num_samples * 2]; tsk_size_t result_size = num_sites * num_sites; double result[result_size]; // Two sample sets for multipop at the bottom, only presenting one to single pop // results for (s = 0; s < ts.num_samples; s++) { sample_sets[s] = (tsk_id_t) s; sample_sets[s + ts.num_samples] = (tsk_id_t) s; } for (s = 0; s < num_sites; s++) { sites[s] = (tsk_id_t) s; } tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_D(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_D2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_r2); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_D_prime(&ts, num_sample_sets, sample_set_sizes, sample_sets, 
num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D_prime); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_r(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_r); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_Dz(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_Dz); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_pi2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_pi2); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_D2_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2_unbiased); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_Dz_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_Dz_unbiased); tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_pi2_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_pi2_unbiased); // We'll 
compute r2 between sample set 0 and sample set 1 num_sample_sets = 2; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_r2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, (tsk_id_t[2]) { 0, 0 }, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_r2); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, (tsk_id_t[2]) { 0, 0 }, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2); // perfectly overlapping sample sets will produce a result equal to the single // population case tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2_ij_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, (tsk_id_t[2]) { 0, 0 }, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2_unbiased); // two disjoint sample sets with 5 and 4 samples {0,1,2,3,4}{5,6,7,8} sample_set_sizes[0] = 5; sample_set_sizes[1] = 4; tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2_ij_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, (tsk_id_t[2]) { 0, 1 }, num_sites, sites, NULL, num_sites, sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2_unbiased_disjoint); tsk_treeseq_free(&ts); tsk_safe_free(sites); } static void test_two_site_backmutation(void) { const char *nodes = "1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n" "1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n" "1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n" "1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n" "1 0 -1\n1 0 -1\n1 0 -1\n0 2 -1\n0 4 -1\n0 6 -1\n0 8 
-1\n0 10 -1\n" "0 12 -1\n0 14 -1\n0 16 -1\n0 18 -1\n0 20 -1\n0 22 -1\n0 24 -1\n0 26 -1\n" "0 28 -1\n0 30 -1\n0 32 -1\n0 34 -1\n0 36 -1\n0 38 -1\n0 40 -1\n0 42 -1\n" "0 44 -1\n0 46 -1\n0 48 -1\n0 50 -1\n0 52 -1\n0 54 -1\n0 56 -1\n0 58 -1\n" "0 60 -1\n0 62 -1\n0 64 -1\n0 66 -1\n0 68 -1\n"; const char *edges = "0 10 35 0,1\n0 10 36 2,35\n0 10 37 3,36\n0 10 38 4,37\n0 10 39 5,38\n" "0 10 40 6,39\n0 10 41 7,40\n0 10 42 8,41\n0 10 43 9,42\n0 10 44 10,43\n" "0 10 45 11,44\n0 10 46 12,45\n0 10 47 13,46\n0 10 48 14,47\n0 10 49 15,48\n" "0 10 50 16,49\n0 10 51 17,50\n0 10 52 18,51\n0 10 53 19,52\n0 10 54 20,53\n" "0 10 55 21,54\n0 10 56 22,55\n0 10 57 23,56\n0 10 58 24,57\n0 10 59 25,58\n" "0 10 60 26,59\n0 10 61 27,60\n0 10 62 28,61\n0 10 63 29,62\n0 10 64 30,63\n" "0 10 65 31,64\n0 10 66 32,65\n0 10 67 33,66\n0 10 68 34,67\n"; const char *sites = "1 A\n" "4.5 T\n"; const char *mutations = "0 50 T -1\n" "0 48 G 0\n" "0 46 A 1\n" "1 62 G -1\n" "1 60 T 3\n" "1 58 A 4\n"; int ret; tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); tsk_size_t num_sample_sets = 1; tsk_size_t num_sites = ts.tables->sites.num_rows; tsk_id_t *row_sites = tsk_malloc(num_sites * sizeof(*row_sites)); tsk_id_t *col_sites = tsk_malloc(num_sites * sizeof(*col_sites)); tsk_size_t sample_set_sizes[1] = { ts.num_samples }; tsk_id_t sample_sets[ts.num_samples]; tsk_size_t result_size = num_sites * num_sites; double result[result_size]; tsk_size_t s; double truth_r2[4] = { 0.999999999999999, 0.042923862278701, 0.042923862278701, 1. 
}; for (s = 0; s < ts.num_samples; s++) { sample_sets[s] = (tsk_id_t) s; } for (s = 0; s < num_sites; s++) { row_sites[s] = (tsk_id_t) s; col_sites[s] = (tsk_id_t) s; } tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_r2); tsk_treeseq_free(&ts); tsk_safe_free(row_sites); tsk_safe_free(col_sites); } static void test_two_locus_branch_all_stats(void) { int ret; tsk_treeseq_t ts; double result[16]; tsk_size_t result_size = 16; tsk_id_t sample_sets[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; tsk_size_t sample_set_sizes[1] = { 10 }; double positions[4] = { 0.0, 2.0, 5.0, 6.0 }; const char *nodes = "1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n1 0 -1\n" "1 0 -1\n1 0 -1\n0 0.02 -1\n0 0.06 -1\n0 0.08 -1\n0 0.09 -1\n0 0.21 -1\n" "0 0.35 -1\n0 0.44 -1\n0 0.69 -1\n0 0.79 -1\n0 0.80 -1\n0 0.84 -1\n" "0 1.26 -1\n"; const char *edges = "0 10 10 0,8\n0 10 11 4,7\n0 10 12 3,9\n0 10 13 6,11\n0 10 14 1,2\n" "5 10 15 5,10\n0 5 16 5,10\n6 10 17 12,14\n2 6 18 14\n5 10 18 15\n" "2 5 18 16\n6 10 18 17\n0 6 19 12\n0 2 19 14\n2 6 19 18\n" "0 2 20 13\n0 2 20 16\n2 10 21 13\n6 10 21 18\n0 6 21 19\n" "0 2 21 20\n"; double truth_D[16] = { 0 }; double truth_D2[16] = { 0.21949755999999998, 0.1867003599999999, 0.18798699999999988, 0.18941379999999983, 0.18670035999999995, 0.21159555999999993, 0.21257979999999996, 0.21222580000000005, 0.187987, 0.21257979999999996, 0.21380379999999996, 0.2134714, 0.18941379999999994, 0.21222579999999996, 0.21347139999999992, 0.21377299999999996 }; double truth_r2[16] = { 6.286870108969513, 5.742220038107836, 5.7080225607835695, 5.623290389581752, 5.742220038107832, 6.3274209876543175, 6.291288603867465, 6.195658345930953, 5.708022560783573, 6.291288603867472, 6.266256220080618, 6.170677280171318, 
5.623290389581758, 6.195658345930966, 6.170677280171324, 6.094109054547737 }; double truth_D_prime[16] = { -9.6552, -9.44459999999999, -9.136799999999988, -8.680999999999989, -9.444599999999998, -9.240699999999984, -8.937399999999977, -8.488499999999984, -9.136799999999996, -8.93739999999999, -8.658399999999984, -8.219399999999993, -8.68099999999999, -8.488499999999991, -8.21939999999999, -7.814699999999995 }; double truth_r[16] = { 0.023193673439522472, 0.023272634599981495, 0.021243465874728862, 0.01919099466703808, 0.023272634599981454, 0.023358527073393587, 0.021370047752011, 0.019268461077492888, 0.021243465874728862, 0.021370047752011012, 0.020359977803327087, 0.01793842604857987, 0.019190994667037817, 0.019268461077492804, 0.017938426048579773, 0.0160605735196305 }; double truth_Dz[16] = { 0.01958895999999996, -0.007941440000000037, -0.007572800000000046, -0.010558400000000029, -0.007941440000000022, 0.01385535999999997, 0.014569599999999966, 0.015529599999999963, -0.007572800000000024, 0.01456959999999996, 0.015426399999999951, 0.016271199999999948, -0.010558400000000011, 0.01552959999999999, 0.016271199999999986, 0.017607999999999985 }; double truth_pi2[16] = { 0.7201219600000001, 0.6895723600000001, 0.6865174000000006, 0.6780314000000008, 0.6895723600000002, 0.6603187600000002, 0.6573934000000002, 0.6492674000000002, 0.6865174000000002, 0.6573934000000003, 0.6544810000000003, 0.6463910000000003, 0.6780314000000002, 0.6492674000000004, 0.6463910000000005, 0.6384010000000007 }; double truth_Dz_unbiased[16] = { -0.06387380952380949, -0.09312571428571428, -0.09361428571428566, -0.10075682539682536, -0.09312571428571428, -0.0734419047619048, -0.0730733333333334, -0.07171301587301597, -0.0936142857142857, -0.07307333333333343, -0.07261476190476202, -0.07147730158730167, -0.10075682539682543, -0.07171301587301596, -0.07147730158730159, -0.06988666666666674 }; double truth_D2_unbiased[16] = { 0.19576484126984134, 0.1586769841269842, 0.16093412698412704, 
0.16485253968253985, 0.15867698412698414, 0.1949926984126984, 0.19673555555555555, 0.19734825396825403, 0.16093412698412699, 0.1967355555555555, 0.19879341269841264, 0.19945182539682532, 0.16485253968253968, 0.19734825396825395, 0.1994518253968253, 0.20091222222222213 }; double truth_pi2_unbiased[16] = { 0.8910765079365083, 0.8571103174603181, 0.853337460317461, 0.8434880952380959, 0.8571103174603178, 0.8182193650793657, 0.8145322222222225, 0.8043504761904768, 0.8533374603174609, 0.8145322222222225, 0.8108450793650795, 0.800729047619048, 0.8434880952380955, 0.8043504761904766, 0.8007290476190477, 0.7906733333333332 }; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_r2(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_r2); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D_prime(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D_prime); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_r(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 
0); assert_arrays_almost_equal(result_size, result, truth_r); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_Dz(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_Dz); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_pi2(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_pi2); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_Dz_unbiased(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_Dz_unbiased); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_D2_unbiased(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_D2_unbiased); tsk_memset(result, 0, sizeof(*result) * result_size); ret = tsk_treeseq_pi2_unbiased(&ts, 1, sample_set_sizes, sample_sets, 4, NULL, positions, 4, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size, result, truth_pi2_unbiased); tsk_treeseq_free(&ts); } static void test_paper_ex_two_site_subset(void) { tsk_treeseq_t ts; double result[4]; int ret; tsk_size_t s, result_size; tsk_size_t sample_set_sizes[1]; tsk_size_t num_sample_sets; tsk_id_t row_sites[2] = { 0, 1 }; tsk_id_t col_sites[2] = { 1, 2 }; double result_truth_1[4] = { 0.1111111111111111, 0.1111111111111111, 1, 1 }; double result_truth_2[1] = { 0.1111111111111111 }; double result_truth_3[4] = { 0.1111111111111111, 1, 0.1111111111111111, 1 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, 
paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); tsk_id_t sample_sets[ts.num_samples]; sample_set_sizes[0] = ts.num_samples; num_sample_sets = 1; for (s = 0; s < ts.num_samples; s++) { sample_sets[s] = (tsk_id_t) s; } result_size = 2 * 2; tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 2, row_sites, NULL, 2, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_sample_sets, result, result_truth_1); result_size = 1 * 1; tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); col_sites[0] = 2; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 1, row_sites, NULL, 1, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_sample_sets, result, result_truth_2); result_size = 2 * 2; tsk_memset(result, 0, sizeof(*result) * result_size * num_sample_sets); row_sites[0] = 1; row_sites[1] = 2; col_sites[0] = 0; col_sites[1] = 1; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 2, row_sites, NULL, 2, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(result_size * num_sample_sets, result, result_truth_3); tsk_treeseq_free(&ts); } static void test_two_locus_stat_input_errors(void) { tsk_treeseq_t ts; int ret; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); tsk_size_t num_sites = ts.tables->sites.num_rows; tsk_id_t *row_sites = tsk_malloc(num_sites * sizeof(*row_sites)); tsk_id_t *col_sites = tsk_malloc(num_sites * sizeof(*col_sites)); tsk_size_t sample_set_sizes[2] = { ts.num_samples, ts.num_samples }; tsk_size_t num_sample_sets = 1; tsk_id_t index_tuples[2] = { 0 }; tsk_size_t num_index_tuples = 1; tsk_id_t sample_sets[ts.num_samples * 2]; // need 2 
sample sets for multipop double positions[10] = { 0., 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 }; double bad_col_positions[2] = { 0., 0. }; // used in 1 test to cover column check double result[100]; tsk_size_t s; for (s = 0; s < ts.num_samples; s++) { sample_sets[s] = (tsk_id_t) s; sample_sets[s + ts.num_samples] = (tsk_id_t) s; } for (s = 0; s < num_sites; s++) { row_sites[s] = (tsk_id_t) s; col_sites[s] = (tsk_id_t) s; } // begin with the happy path ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_two_locus_count_stat(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, NULL, NULL, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_RESULT_DIMS); ret = tsk_treeseq_r2(&ts, 1, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); sample_sets[1] = 0; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); sample_sets[1] = 1; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, TSK_STAT_SITE | TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES); ret = tsk_treeseq_r2(&ts, 0, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); sample_set_sizes[0] = 0; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EMPTY_SAMPLE_SET); sample_set_sizes[0] = ts.num_samples; sample_sets[1] = 10; ret = tsk_treeseq_r2(&ts, 
num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); sample_sets[1] = 1; row_sites[0] = 1000; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); row_sites[0] = 0; col_sites[num_sites - 1] = (tsk_id_t) num_sites; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); col_sites[num_sites - 1] = (tsk_id_t) num_sites - 1; row_sites[0] = 1; row_sites[1] = 0; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_STAT_UNSORTED_SITES); row_sites[0] = 0; row_sites[1] = 1; row_sites[0] = 1; row_sites[1] = 1; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_STAT_DUPLICATE_SITES); row_sites[0] = 0; row_sites[1] = 1; // Not an error condition, but we want to record this behavior. The method is robust // to zero-length site/position inputs. 
ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, NULL, 0, NULL, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 0, NULL, NULL, 0, NULL, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); positions[9] = 1; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 10, NULL, positions, 10, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POSITION_OUT_OF_BOUNDS); positions[9] = 0.9; positions[0] = -0.1; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 10, NULL, positions, 10, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POSITION_OUT_OF_BOUNDS); positions[0] = 0; positions[0] = 0.1; positions[1] = 0; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 10, NULL, positions, 10, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_STAT_UNSORTED_POSITIONS); positions[0] = 0; positions[1] = 0.1; // rows always fail first, check columns ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 10, NULL, positions, 2, NULL, bad_col_positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_STAT_DUPLICATE_POSITIONS); positions[0] = 0; positions[1] = 0; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 10, NULL, positions, 10, NULL, positions, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_STAT_DUPLICATE_POSITIONS); positions[0] = 0; positions[1] = 0.1; ret = tsk_treeseq_r2(&ts, num_sample_sets, sample_set_sizes, sample_sets, 10, NULL, positions, 10, NULL, positions, TSK_STAT_NODE, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE); num_sample_sets = 2; num_index_tuples = 0; ret = tsk_treeseq_r2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_sites, row_sites, NULL, num_sites, col_sites, 
NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_INDEX_TUPLES); num_sample_sets = 0; num_index_tuples = 1; ret = tsk_treeseq_D2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); num_sample_sets = 2; index_tuples[0] = 2; ret = tsk_treeseq_D2_ij_unbiased(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_sites, row_sites, NULL, num_sites, col_sites, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLE_SET_INDEX); tsk_treeseq_free(&ts); tsk_safe_free(row_sites); tsk_safe_free(col_sites); } static void test_simplest_divergence_matrix(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n"; const char *edges = "0 1 2 0,1\n"; const char *sites = "0.1 A\n" "0.6 A\n"; const char *mutations = "0 0 B -1\n" "1 0 B -1\n"; tsk_treeseq_t ts; tsk_id_t sample_ids[] = { 0, 1 }; double D_branch[4] = { 0, 2, 2, 0 }; double D_site[4] = { 0, 2, 2, 0 }; double result[4]; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_divergence_matrix( &ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_branch, result); ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_branch, result); ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_site, result); ret = tsk_treeseq_divergence_matrix( &ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_SPAN_NORMALISE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_site, result); ret = tsk_treeseq_divergence_matrix( &ts, 2, NULL, sample_ids, 0, NULL, 
TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_site, result); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_branch, result); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_site, result); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_NODE, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_POLARISED, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_STAT_POLARISED_UNSUPPORTED); ret = tsk_treeseq_divergence_matrix( &ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE | TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES); sample_ids[0] = -1; ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); sample_ids[0] = 3; ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); sample_ids[0] = 1; ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); ret = tsk_treeseq_divergence_matrix( &ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); sample_ids[0] = 2; ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SAMPLES); tsk_treeseq_free(&ts); } static void test_simplest_divergence_matrix_windows(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n"; const char *edges = "0 1 2 0,1\n"; const char *sites = "0.1 A\n" "0.6 A\n"; const char *mutations = "0 0 B -1\n" "1 0 B -1\n"; tsk_treeseq_t ts; tsk_id_t sample_ids[] = { 
0, 1 }; double D_branch[8] = { 0, 1, 1, 0, 0, 1, 1, 0 }; double D_site[8] = { 0, 1, 1, 0, 0, 1, 1, 0 }; double result[8]; double windows[] = { 0, 0.5, 1 }; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 2, windows, 0, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(8, D_site, result); ret = tsk_treeseq_divergence_matrix( &ts, 2, NULL, sample_ids, 2, windows, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(8, D_branch, result); /* Windows for the second half */ ret = tsk_treeseq_divergence_matrix( &ts, 2, NULL, sample_ids, 1, windows + 1, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_site, result); ret = tsk_treeseq_divergence_matrix( &ts, 2, NULL, sample_ids, 1, windows + 1, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(4, D_branch, result); ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, windows, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS); windows[0] = -1; ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 2, windows, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = 0.45; windows[2] = 1.5; ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 2, windows, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); windows[0] = 0.55; windows[2] = 1.0; ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 2, windows, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS); tsk_treeseq_free(&ts); } static void test_simplest_divergence_matrix_internal_sample(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "1 1 0\n"; const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts; tsk_id_t sample_ids[] = { 0, 1, 2 }; double result[9]; double D_branch[9] = { 0, 2, 1, 2, 0, 1, 1, 1, 0 }; double D_site[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int ret; 
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_divergence_matrix( &ts, 3, NULL, sample_ids, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(9, D_branch, result); ret = tsk_treeseq_divergence_matrix( &ts, 3, NULL, sample_ids, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_arrays_almost_equal(9, D_site, result); tsk_treeseq_free(&ts); } static void test_multiroot_divergence_matrix(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, multiroot_ex_nodes, multiroot_ex_edges, NULL, multiroot_ex_sites, multiroot_ex_mutations, NULL, NULL, 0); verify_divergence_matrix(&ts, TSK_STAT_BRANCH); verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE); verify_divergence_matrix(&ts, TSK_STAT_SITE); verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE); tsk_treeseq_free(&ts); } static void test_pair_coalescence_counts(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_pair_coalescence_counts(&ts, 0); verify_pair_coalescence_counts(&ts, TSK_STAT_SPAN_NORMALISE); verify_pair_coalescence_counts(&ts, TSK_STAT_PAIR_NORMALISE); verify_pair_coalescence_counts( &ts, TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE); tsk_treeseq_free(&ts); } static void test_pair_coalescence_counts_missing(void) { tsk_treeseq_t ts; tsk_treeseq_from_text( &ts, 5, missing_ex_nodes, missing_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); verify_pair_coalescence_counts(&ts, 0); verify_pair_coalescence_counts(&ts, TSK_STAT_SPAN_NORMALISE); tsk_treeseq_free(&ts); } static void test_pair_coalescence_quantiles(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_pair_coalescence_quantiles(&ts); tsk_treeseq_free(&ts); } static void 
test_pair_coalescence_rates(void) { tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_pair_coalescence_rates(&ts); tsk_treeseq_free(&ts); } int main(int argc, char **argv) { CU_TestInfo tests[] = { { "test_general_stat_input_errors", test_general_stat_input_errors }, { "test_empty_ts_ld", test_empty_ts_ld }, { "test_empty_ts_mean_descendants", test_empty_ts_mean_descendants }, { "test_empty_ts_genealogical_nearest_neighbours", test_empty_ts_genealogical_nearest_neighbours }, { "test_empty_ts_general_stat", test_empty_ts_general_stat }, { "test_empty_ts_afs", test_empty_ts_afs }, { "test_single_tree_ld", test_single_tree_ld }, { "test_single_tree_mean_descendants", test_single_tree_mean_descendants }, { "test_single_tree_genealogical_nearest_neighbours", test_single_tree_genealogical_nearest_neighbours }, { "test_single_tree_general_stat", test_single_tree_general_stat }, { "test_single_tree_general_stat_errors", test_single_tree_general_stat_errors }, { "test_single_tree_divergence_matrix", test_single_tree_divergence_matrix }, { "test_single_tree_divergence_matrix_internal_samples", test_single_tree_divergence_matrix_internal_samples }, { "test_single_tree_divergence_matrix_multi_root", test_single_tree_divergence_matrix_multi_root }, { "test_paper_ex_ld", test_paper_ex_ld }, { "test_paper_ex_mean_descendants", test_paper_ex_mean_descendants }, { "test_paper_ex_genealogical_nearest_neighbours", test_paper_ex_genealogical_nearest_neighbours }, { "test_paper_ex_general_stat_errors", test_paper_ex_general_stat_errors }, { "test_paper_ex_general_stat", test_paper_ex_general_stat }, { "test_paper_ex_trait_covariance_errors", test_paper_ex_trait_covariance_errors }, { "test_paper_ex_trait_covariance", test_paper_ex_trait_covariance }, { "test_paper_ex_trait_correlation_errors", test_paper_ex_trait_correlation_errors }, { "test_paper_ex_trait_correlation", 
test_paper_ex_trait_correlation }, { "test_paper_ex_trait_linear_model_errors", test_paper_ex_trait_linear_model_errors }, { "test_paper_ex_trait_linear_model", test_paper_ex_trait_linear_model }, { "test_paper_ex_diversity_errors", test_paper_ex_diversity_errors }, { "test_paper_ex_diversity", test_paper_ex_diversity }, { "test_paper_ex_segregating_sites_errors", test_paper_ex_segregating_sites_errors }, { "test_paper_ex_segregating_sites", test_paper_ex_segregating_sites }, { "test_paper_ex_Y1_errors", test_paper_ex_Y1_errors }, { "test_paper_ex_Y1", test_paper_ex_Y1 }, { "test_paper_ex_divergence_errors", test_paper_ex_divergence_errors }, { "test_paper_ex_divergence", test_paper_ex_divergence }, { "test_paper_ex_genetic_relatedness_errors", test_paper_ex_genetic_relatedness_errors }, { "test_paper_ex_genetic_relatedness", test_paper_ex_genetic_relatedness }, { "test_paper_ex_genetic_relatedness_weighted", test_paper_ex_genetic_relatedness_weighted }, { "test_paper_ex_genetic_relatedness_weighted_errors", test_paper_ex_genetic_relatedness_weighted_errors }, { "test_empty_genetic_relatedness_vector", test_empty_genetic_relatedness_vector }, { "test_paper_ex_genetic_relatedness_vector", test_paper_ex_genetic_relatedness_vector }, { "test_paper_ex_genetic_relatedness_vector_errors", test_paper_ex_genetic_relatedness_vector_errors }, { "test_paper_ex_genetic_relatedness_vector_node_errors", test_paper_ex_genetic_relatedness_vector_node_errors }, { "test_paper_ex_Y2_errors", test_paper_ex_Y2_errors }, { "test_paper_ex_Y2", test_paper_ex_Y2 }, { "test_paper_ex_f2_errors", test_paper_ex_f2_errors }, { "test_paper_ex_f2", test_paper_ex_f2 }, { "test_paper_ex_Y3_errors", test_paper_ex_Y3_errors }, { "test_paper_ex_Y3", test_paper_ex_Y3 }, { "test_paper_ex_f3_errors", test_paper_ex_f3_errors }, { "test_paper_ex_f3", test_paper_ex_f3 }, { "test_paper_ex_f4_errors", test_paper_ex_f4_errors }, { "test_paper_ex_f4", test_paper_ex_f4 }, { "test_paper_ex_afs_errors", 
test_paper_ex_afs_errors }, { "test_paper_ex_afs", test_paper_ex_afs }, { "test_paper_ex_divergence_matrix", test_paper_ex_divergence_matrix }, { "test_unary_ex_afs", test_unary_ex_afs }, { "test_nonbinary_ex_ld", test_nonbinary_ex_ld }, { "test_nonbinary_ex_mean_descendants", test_nonbinary_ex_mean_descendants }, { "test_nonbinary_ex_genealogical_nearest_neighbours", test_nonbinary_ex_genealogical_nearest_neighbours }, { "test_nonbinary_ex_general_stat", test_nonbinary_ex_general_stat }, { "test_nonbinary_ex_general_stat_errors", test_nonbinary_ex_general_stat_errors }, { "test_caterpillar_tree_ld", test_caterpillar_tree_ld }, { "test_ld_multi_mutations", test_ld_multi_mutations }, { "test_ld_silent_mutations", test_ld_silent_mutations }, { "test_paper_ex_two_site", test_paper_ex_two_site }, { "test_paper_ex_two_branch", test_paper_ex_two_branch }, { "test_two_site_correlated_multiallelic", test_two_site_correlated_multiallelic }, { "test_two_site_uncorrelated_multiallelic", test_two_site_uncorrelated_multiallelic }, { "test_two_site_backmutation", test_two_site_backmutation }, { "test_two_locus_site_all_stats", test_two_locus_branch_all_stats }, { "test_paper_ex_two_site_subset", test_paper_ex_two_site_subset }, { "test_two_locus_stat_input_errors", test_two_locus_stat_input_errors }, { "test_simplest_divergence_matrix", test_simplest_divergence_matrix }, { "test_simplest_divergence_matrix_windows", test_simplest_divergence_matrix_windows }, { "test_simplest_divergence_matrix_internal_sample", test_simplest_divergence_matrix_internal_sample }, { "test_multiroot_divergence_matrix", test_multiroot_divergence_matrix }, { "test_pair_coalescence_counts", test_pair_coalescence_counts }, { "test_pair_coalescence_counts_missing", test_pair_coalescence_counts_missing }, { "test_pair_coalescence_quantiles", test_pair_coalescence_quantiles }, { "test_pair_coalescence_rates", test_pair_coalescence_rates }, { NULL, NULL }, }; return test_main(tests, argc, argv); } 
================================================ FILE: c/tests/test_tables.c ================================================ /* * MIT License * * Copyright (c) 2019-2023 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "testlib.h" #include "tskit/core.h" #include #include #include #include static void reverse_migrations(tsk_table_collection_t *tables) { int ret; tsk_migration_table_t migrations; tsk_migration_t migration; tsk_id_t j, ret_id; /* Easy way to copy the metadata schema */ ret = tsk_migration_table_copy(&tables->migrations, &migrations, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_clear(&migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = (tsk_id_t) tables->migrations.num_rows - 1; j >= 0; j--) { ret = tsk_migration_table_get_row(&tables->migrations, j, &migration); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row(&migrations, migration.left, migration.right, migration.node, migration.source, migration.dest, migration.time, migration.metadata, migration.metadata_length); CU_ASSERT_FATAL(ret_id >= 0); } ret = tsk_migration_table_copy(&migrations, &tables->migrations, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_migration_table_free(&migrations); } static void reverse_edges(tsk_table_collection_t *tables) { int ret; tsk_edge_table_t edges; tsk_edge_t edge; tsk_id_t j, ret_id; /* Easy way to copy the metadata schema */ ret = tsk_edge_table_copy(&tables->edges, &edges, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_clear(&edges); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = (tsk_id_t) tables->edges.num_rows - 1; j >= 0; j--) { ret = tsk_edge_table_get_row(&tables->edges, j, &edge); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&edges, edge.left, edge.right, edge.parent, edge.child, edge.metadata, edge.metadata_length); CU_ASSERT_FATAL(ret_id >= 0); } ret = tsk_edge_table_copy(&edges, &tables->edges, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_edge_table_free(&edges); } static void reverse_mutations(tsk_table_collection_t *tables) { int ret; tsk_mutation_table_t mutations; tsk_mutation_t mutation; tsk_id_t j, ret_id; tsk_id_t new_parent; tsk_id_t n = (tsk_id_t) tables->mutations.num_rows; ret = 
tsk_mutation_table_init(&mutations, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = n - 1; j >= 0; j--) { ret = tsk_mutation_table_get_row(&tables->mutations, j, &mutation); CU_ASSERT_EQUAL_FATAL(ret, 0); new_parent = (mutation.parent == TSK_NULL) ? TSK_NULL : n - mutation.parent - 1; ret_id = tsk_mutation_table_add_row(&mutations, mutation.site, mutation.node, new_parent, mutation.time, mutation.derived_state, mutation.derived_state_length, mutation.metadata, mutation.metadata_length); CU_ASSERT_FATAL(ret_id >= 0); } ret = tsk_mutation_table_copy(&mutations, &tables->mutations, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_mutation_table_free(&mutations); } static void insert_edge_metadata(tsk_table_collection_t *tables) { int ret; tsk_edge_table_t edges; tsk_edge_t edge; tsk_id_t j, ret_id; char metadata[100]; ret = tsk_edge_table_init(&edges, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) tables->edges.num_rows; j++) { ret = tsk_edge_table_get_row(&tables->edges, j, &edge); CU_ASSERT_EQUAL_FATAL(ret, 0); snprintf(metadata, sizeof(metadata), "md_%lld\n", (long long) j); ret_id = tsk_edge_table_add_row(&edges, edge.left, edge.right, edge.parent, edge.child, metadata, (tsk_size_t) strlen(metadata)); CU_ASSERT_FATAL(ret_id >= 0); } ret = tsk_edge_table_copy(&edges, &tables->edges, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_edge_table_free(&edges); } static void test_table_collection_equals_options(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tc1, tc2; char example_time_units[100] = "An example of time units with unicode ⏰"; char example_metadata[100] = "An example of metadata with unicode 🎄🌳🌴🌲🎋"; char example_metadata_schema[100] = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_time_units_length = (tsk_size_t) strlen(example_time_units); tsk_size_t example_metadata_length = (tsk_size_t) strlen(example_metadata); tsk_size_t example_metadata_schema_length = (tsk_size_t) strlen(example_metadata_schema); // Test 
equality empty tables ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_init(&tc2, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_TRUE(ret); // Adding some meat to the tables ret_id = tsk_node_table_add_row(&tc1.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row(&tc1.nodes, TSK_NODE_IS_SAMPLE, 1.0, 0, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_individual_table_add_row(&tc1.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_population_table_add_row(&tc1.populations, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tc1.edges, 0.0, 1.0, 1, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_site_table_add_row(&tc1.sites, 0.2, "A", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tc1.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT(ret_id >= 0); // Equality of empty vs non-empty ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_FALSE(ret); ret = tsk_table_collection_copy(&tc1, &tc2, TSK_NO_INIT); CU_ASSERT_EQUAL(ret, 0); // Equivalent except for time_units ret = tsk_table_collection_set_metadata( &tc1, example_time_units, example_time_units_length); CU_ASSERT_EQUAL(ret, 0); // Equivalent except for metadata ret = tsk_table_collection_set_metadata( &tc1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_TS_METADATA); CU_ASSERT_TRUE(ret); /* TSK_CMP_IGNORE_METADATA implies TSK_CMP_IGNORE_TS_METADATA */ ret = tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_METADATA); CU_ASSERT_TRUE(ret); ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_FALSE(ret); ret = tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_PROVENANCE); CU_ASSERT_FALSE(ret); ret = tsk_table_collection_set_metadata( &tc2, example_metadata, 
example_metadata_length); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_TRUE(ret); ret = tsk_table_collection_set_metadata_schema( &tc1, example_metadata_schema, example_metadata_schema_length); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_TS_METADATA); CU_ASSERT_TRUE(ret); ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_FALSE(ret); ret = tsk_table_collection_set_metadata_schema( &tc2, example_metadata_schema, example_metadata_schema_length); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_TRUE(ret); // Ignore provenance ret_id = tsk_provenance_table_add_row(&tc1.provenances, "time", 4, "record", 6); CU_ASSERT_EQUAL(ret_id, 0); ret = tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_PROVENANCE); CU_ASSERT_TRUE(ret); ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_FALSE(ret); ret = tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_TS_METADATA); CU_ASSERT_FALSE(ret); ret_id = tsk_provenance_table_add_row(&tc2.provenances, "time", 4, "record", 6); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_PROVENANCE); CU_ASSERT_TRUE(ret); ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_TRUE(ret); // Ignore provenance timestamp ret_id = tsk_provenance_table_add_row(&tc1.provenances, "time", 4, "record", 6); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_provenance_table_add_row(&tc2.provenances, "other", 5, "record", 6); CU_ASSERT_FATAL(ret_id >= 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_PROVENANCE)); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_TIMESTAMPS)); // Ignore provenance and top-level metadata. 
ret = tsk_provenance_table_clear(&tc1.provenances); CU_ASSERT_EQUAL(ret, 0); example_metadata[0] = 'J'; ret = tsk_table_collection_set_metadata( &tc1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_equals(&tc1, &tc2, 0); CU_ASSERT_FALSE(ret); ret = tsk_table_collection_equals( &tc1, &tc2, TSK_CMP_IGNORE_TS_METADATA | TSK_CMP_IGNORE_PROVENANCE); CU_ASSERT_TRUE(ret); tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); // Check what happens when one of the tables just differs by metadata. ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_init(&tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_population_table_add_row(&tc1.populations, "metadata", 8); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_population_table_add_row(&tc2.populations, "", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_METADATA)); tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); // Ignore tables ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_init(&tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_metadata( &tc1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_set_metadata( &tc2, example_metadata, example_metadata_length); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); // Add one row for each table we're ignoring ret_id = tsk_individual_table_add_row(&tc1.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row(&tc1.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tc1.edges, 0.0, 1.0, 1, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_migration_table_add_row(&tc1.migrations, 0, 0, 0, 0, 0, 0, NULL, 
0); CU_ASSERT(ret_id >= 0); ret_id = tsk_site_table_add_row(&tc1.sites, 0.2, "A", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tc1.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_population_table_add_row(&tc1.populations, NULL, 0); CU_ASSERT(ret_id >= 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_TABLES)); tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); // Ignore reference sequence ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_init(&tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_metadata( &tc1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_set_metadata( &tc2, example_metadata, example_metadata_length); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_data(&tc1.reference_sequence, "A", 1); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); CU_ASSERT_TRUE( tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_REFERENCE_SEQUENCE)); tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); } static void test_table_collection_simplify_errors(void) { int ret; tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1 }; tsk_id_t ret_id; const char *individuals = "1 0.25 -2\n"; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); /* Bad samples */ samples[0] = -1; ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 
TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = 10; ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); samples[0] = 0; /* Duplicate samples */ samples[0] = 0; samples[1] = 0; ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); samples[0] = 0; ret_id = tsk_site_table_add_row(&tables.sites, 0, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_simplify(&tables, samples, 0, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SITE_POSITION); /* Out of order positions */ tables.sites.position[0] = 0.5; ret = tsk_table_collection_simplify(&tables, samples, 0, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_SITES); /* Position out of bounds */ tables.sites.position[0] = 1.5; ret = tsk_table_collection_simplify(&tables, samples, 0, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SITE_POSITION); tsk_site_table_truncate(&tables.sites, 0); tables.sites.position[0] = 0; /* Individual out of bounds */ parse_individuals(individuals, &tables.individuals); CU_ASSERT_EQUAL_FATAL(tables.individuals.num_rows, 1); ret = tsk_table_collection_simplify(&tables, samples, 0, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); /* TODO More tests for this: see * https://github.com/tskit-dev/msprime/issues/517 */ tsk_table_collection_free(&tables); } static void test_reference_sequence_state_machine(void) { tsk_reference_sequence_t r1; tsk_reference_sequence_init(&r1, 0); CU_ASSERT_EQUAL(r1.data, NULL); CU_ASSERT_EQUAL(r1.url, NULL); CU_ASSERT_EQUAL(r1.metadata, NULL); CU_ASSERT_EQUAL(r1.metadata_schema, NULL); CU_ASSERT_TRUE(tsk_reference_sequence_is_null(&r1)); CU_ASSERT_EQUAL(tsk_reference_sequence_set_data(&r1, "x", 1), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); /* Setting the value back to 
NULL makes the reference whole object NULL */ CU_ASSERT_EQUAL(tsk_reference_sequence_set_data(&r1, NULL, 0), 0); CU_ASSERT_TRUE(tsk_reference_sequence_is_null(&r1)); tsk_reference_sequence_free(&r1); CU_ASSERT_TRUE(tsk_reference_sequence_is_null(&r1)); /* Any empty string is the same thing. */ tsk_reference_sequence_init(&r1, 0); CU_ASSERT_EQUAL(tsk_reference_sequence_set_data(&r1, "", 0), 0); CU_ASSERT_TRUE(tsk_reference_sequence_is_null(&r1)); tsk_reference_sequence_free(&r1); tsk_reference_sequence_init(&r1, 0); CU_ASSERT_EQUAL(tsk_reference_sequence_set_url(&r1, "x", 1), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); tsk_reference_sequence_free(&r1); tsk_reference_sequence_init(&r1, 0); CU_ASSERT_EQUAL(tsk_reference_sequence_set_metadata(&r1, "x", 1), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); tsk_reference_sequence_free(&r1); tsk_reference_sequence_init(&r1, 0); CU_ASSERT_EQUAL(tsk_reference_sequence_set_metadata_schema(&r1, "x", 1), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); tsk_reference_sequence_free(&r1); tsk_reference_sequence_init(&r1, 0); CU_ASSERT_EQUAL(tsk_reference_sequence_set_metadata(&r1, "x", 1), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); CU_ASSERT_EQUAL(tsk_reference_sequence_set_metadata_schema(&r1, "x", 1), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); CU_ASSERT_EQUAL(tsk_reference_sequence_set_url(&r1, "x", 1), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); CU_ASSERT_EQUAL(tsk_reference_sequence_set_data(&r1, "x", 1), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); CU_ASSERT_EQUAL(tsk_reference_sequence_set_metadata(&r1, "", 0), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); CU_ASSERT_EQUAL(tsk_reference_sequence_set_metadata_schema(&r1, "", 0), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); CU_ASSERT_EQUAL(tsk_reference_sequence_set_url(&r1, "", 0), 0); CU_ASSERT_FALSE(tsk_reference_sequence_is_null(&r1)); 
CU_ASSERT_EQUAL(tsk_reference_sequence_set_data(&r1, "", 0), 0); CU_ASSERT_TRUE(tsk_reference_sequence_is_null(&r1)); tsk_reference_sequence_free(&r1); } static void test_reference_sequence_take(void) { int ret; tsk_reference_sequence_t r1; tsk_reference_sequence_t r2; const char *const_data = "data"; const char *const_metadata = "metadata"; char *takeset_data = strdup(const_data); char *takeset_metadata = strdup(const_metadata); ret = tsk_reference_sequence_init(&r1, 0); ret = tsk_reference_sequence_set_data(&r1, const_data, strlen(const_data)); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_set_metadata( &r1, const_metadata, strlen(const_metadata)); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_init(&r2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_takeset_data(&r2, takeset_data, strlen(takeset_data)); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_takeset_metadata( &r2, takeset_metadata, strlen(takeset_metadata)); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); /* Writing over these with copies doesn't lose memory */ ret = tsk_reference_sequence_set_data(&r2, const_data, strlen(const_data)); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_set_metadata( &r2, const_metadata, strlen(const_metadata)); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); /* The original copies are gone, make some new ones */ takeset_data = strdup(const_data); takeset_metadata = strdup(const_metadata); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_takeset_data(&r1, takeset_data, strlen(takeset_data)); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_takeset_metadata( &r1, takeset_metadata, strlen(takeset_metadata)); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); tsk_reference_sequence_free(&r1); tsk_reference_sequence_free(&r2); } static void test_reference_sequence(void) { int ret; tsk_reference_sequence_t r1; tsk_reference_sequence_t r2; const char example_data[100] = "An example string with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_data_length = (tsk_size_t) strlen(example_data); const char example_url[100] = "An example url with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_url_length = (tsk_size_t) strlen(example_url); const char example_metadata[100] = "An example metadata with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_metadata_length = (tsk_size_t) strlen(example_metadata); const char example_schema[100] = "An example schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_schema_length = (tsk_size_t) strlen(example_schema); tsk_reference_sequence_init(&r1, 0); tsk_reference_sequence_init(&r2, 0); /* NULL sequences are initially equal */ CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_data(&r1, example_data, example_data_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_data(&r1, "", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_data(&r2, "", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_data(&r1, example_data, example_data_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_data(&r2, example_data, example_data_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_url(&r1, example_url, example_url_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_url(&r2, 
example_url, example_url_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_metadata( &r1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_reference_sequence_equals(&r1, &r2, 0)); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, TSK_CMP_IGNORE_METADATA)); ret = tsk_reference_sequence_set_metadata( &r2, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, TSK_CMP_IGNORE_METADATA)); ret = tsk_reference_sequence_set_metadata_schema( &r1, example_schema, example_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_reference_sequence_equals(&r1, &r2, 0)); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, TSK_CMP_IGNORE_METADATA)); ret = tsk_reference_sequence_set_metadata_schema( &r2, example_schema, example_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, TSK_CMP_IGNORE_METADATA)); // Test copy tsk_reference_sequence_free(&r1); tsk_reference_sequence_free(&r2); tsk_reference_sequence_init(&r1, 0); ret = tsk_reference_sequence_set_data(&r1, example_data, example_data_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_copy(&r1, &r2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_url(&r1, example_url, example_url_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_copy(&r1, &r2, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_metadata( &r1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_copy(&r1, &r2, TSK_NO_INIT); 
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); ret = tsk_reference_sequence_set_metadata_schema( &r1, example_schema, example_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_copy(&r1, &r2, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_reference_sequence_equals(&r1, &r2, 0)); tsk_reference_sequence_free(&r1); tsk_reference_sequence_free(&r2); } static void test_table_collection_reference_sequence(void) { int ret; tsk_table_collection_t tc1, tc2; char example_data[100] = "An example string with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_data_length = (tsk_size_t) strlen(example_data); char example_url[100] = "An example url with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_url_length = (tsk_size_t) strlen(example_url); char example_metadata[100] = "An example metadata with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_metadata_length = (tsk_size_t) strlen(example_metadata); char example_schema[100] = "An example schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_schema_length = (tsk_size_t) strlen(example_schema); // Test equality ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_init(&tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_data( &tc1.reference_sequence, example_data, example_data_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_data( &tc2.reference_sequence, example_data, example_data_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_url( &tc1.reference_sequence, example_url, example_url_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_url( &tc2.reference_sequence, example_url, example_url_length); 
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_metadata( &tc1.reference_sequence, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_metadata( &tc2.reference_sequence, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_metadata_schema( &tc1.reference_sequence, example_schema, example_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_metadata_schema( &tc2.reference_sequence, example_schema, example_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); // Test copy tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_set_data( &tc1.reference_sequence, example_data, example_data_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_copy(&tc1, &tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_url( &tc1.reference_sequence, example_url, example_url_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_copy(&tc1, &tc2, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_metadata( &tc1.reference_sequence, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_copy(&tc1, &tc2, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_reference_sequence_set_metadata_schema( &tc1.reference_sequence, example_schema, 
example_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_copy(&tc1, &tc2, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); // Test dump and load ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tc1.sequence_length = 1.0; ret = tsk_reference_sequence_set_data( &tc1.reference_sequence, example_data, example_data_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_set_url( &tc1.reference_sequence, example_url, example_url_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_set_metadata( &tc1.reference_sequence, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_reference_sequence_set_metadata_schema( &tc1.reference_sequence, example_schema, example_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_dump(&tc1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tc2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); } static void test_table_collection_has_reference_sequence(void) { int ret; tsk_table_collection_t tc; ret = tsk_table_collection_init(&tc, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tc.sequence_length = 1.0; CU_ASSERT_FALSE(tsk_table_collection_has_reference_sequence(&tc)); ret = tsk_reference_sequence_set_data(&tc.reference_sequence, "A", 1); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_has_reference_sequence(&tc)); /* Goes back to NULL by setting a empty string. See * test_reference_sequence_state_machine for detailed tests. 
*/ ret = tsk_reference_sequence_set_data(&tc.reference_sequence, "", 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_has_reference_sequence(&tc)); tsk_table_collection_free(&tc); } static void test_table_collection_metadata(void) { int ret; tsk_table_collection_t tc1, tc2; char example_metadata[100] = "An example of metadata with unicode 🎄🌳🌴🌲🎋"; char *takeset_metadata; char example_metadata_schema[100] = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_metadata_length = (tsk_size_t) strlen(example_metadata); tsk_size_t example_metadata_schema_length = (tsk_size_t) strlen(example_metadata_schema); // Test equality ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_init(&tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_table_collection_set_metadata( &tc1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_table_collection_set_metadata( &tc2, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_table_collection_set_metadata_schema( &tc1, example_metadata_schema, example_metadata_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_table_collection_set_metadata_schema( &tc2, example_metadata_schema, example_metadata_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); // Test copy tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_metadata( &tc1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_copy(&tc1, &tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_table_collection_set_metadata_schema( &tc1, example_metadata_schema, example_metadata_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tc2); ret = tsk_table_collection_copy(&tc1, &tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); // Test dump and load with empty metadata and schema tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tc1.sequence_length = 1.0; ret = tsk_table_collection_dump(&tc1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tc2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); // Test dump and load with set metadata and schema tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tc1.sequence_length = 1.0; ret = tsk_table_collection_set_metadata( &tc1, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_metadata_schema( &tc1, example_metadata_schema, example_metadata_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_dump(&tc1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tc2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); takeset_metadata = tsk_malloc(example_metadata_length * sizeof(char)); CU_ASSERT_FATAL(takeset_metadata != NULL); memcpy(takeset_metadata, &example_metadata, (size_t) (example_metadata_length * sizeof(char))); ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_takeset_metadata( &tc1, takeset_metadata, example_metadata_length); 
CU_ASSERT_EQUAL( tsk_memcmp(tc1.metadata, &example_metadata, example_metadata_length), 0); tsk_table_collection_free(&tc1); } static void test_table_collection_time_units(void) { int ret; tsk_table_collection_t tc1, tc2; char example_time_units[100] = "An example of time units with unicode ⏰"; tsk_size_t example_time_units_length = (tsk_size_t) strlen(example_time_units); // Test equality ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_init(&tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_table_collection_set_time_units( &tc1, example_time_units, example_time_units_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); ret = tsk_table_collection_set_time_units( &tc2, example_time_units, example_time_units_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); // Test copy tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_time_units( &tc1, example_time_units, example_time_units_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_copy(&tc1, &tc2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); // Test dump and load with default time_units tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ret, strncmp(tc1.time_units, TSK_TIME_UNITS_UNKNOWN, 7)); tc1.sequence_length = 1.0; ret = tsk_table_collection_dump(&tc1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tc2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); // Test dump and load with set time_units and schema 
tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); ret = tsk_table_collection_init(&tc1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tc1.sequence_length = 1.0; ret = tsk_table_collection_set_time_units( &tc1, example_time_units, example_time_units_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_dump(&tc1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&tc2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); } static void test_node_table(void) { int ret; tsk_id_t ret_id; tsk_node_table_t table, table2; tsk_node_t node, node2; tsk_size_t num_rows = 100; tsk_id_t j; tsk_flags_t *flags; tsk_id_t *population; double *time; tsk_id_t *individual; char *metadata; tsk_size_t *metadata_offset; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; char metadata_copy[test_metadata_length + 1]; tsk_id_t row_subset[6] = { 1, 9, 1, 0, 2, 2 }; tsk_size_t num_row_subset = 6; metadata_copy[test_metadata_length] = '\0'; ret = tsk_node_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_node_table_set_max_rows_increment(&table, 1); tsk_node_table_set_max_metadata_length_increment(&table, 1); tsk_node_table_print_state(&table, _devnull); ret = tsk_node_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_node_table_add_row(&table, (tsk_flags_t) j, (double) j, j, j, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); CU_ASSERT_EQUAL(table.flags[j], (tsk_flags_t) j); CU_ASSERT_EQUAL(table.time[j], j); CU_ASSERT_EQUAL(table.population[j], j); CU_ASSERT_EQUAL(table.individual[j], j); CU_ASSERT_EQUAL(table.num_rows, (tsk_size_t) j + 1); CU_ASSERT_EQUAL( table.metadata_length, (tsk_size_t) (j + 1) * test_metadata_length); CU_ASSERT_EQUAL(table.metadata_offset[j + 1], table.metadata_length); /* 
check the metadata */ tsk_memcpy(metadata_copy, table.metadata + table.metadata_offset[j], test_metadata_length); CU_ASSERT_NSTRING_EQUAL(metadata_copy, test_metadata, test_metadata_length); ret = tsk_node_table_get_row(&table, (tsk_id_t) j, &node); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(node.id, j); CU_ASSERT_EQUAL(node.flags, (tsk_size_t) j); CU_ASSERT_EQUAL(node.time, j); CU_ASSERT_EQUAL(node.population, j); CU_ASSERT_EQUAL(node.individual, j); CU_ASSERT_EQUAL(node.metadata_length, test_metadata_length); CU_ASSERT_NSTRING_EQUAL(node.metadata, test_metadata, test_metadata_length); } /* Test equality with and without metadata */ tsk_node_table_copy(&table, &table2, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the metadata values */ table2.metadata[0] = 0; CU_ASSERT_FALSE(tsk_node_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the last metadata entry */ table2.metadata_offset[table2.num_rows] = table2.metadata_offset[table2.num_rows - 1]; CU_ASSERT_FALSE(tsk_node_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Delete all metadata */ tsk_memset(table2.metadata_offset, 0, (table2.num_rows + 1) * sizeof(*table2.metadata_offset)); CU_ASSERT_FALSE(tsk_node_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_node_table_free(&table2); CU_ASSERT_EQUAL(tsk_node_table_get_row(&table, (tsk_id_t) num_rows, &node), TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_node_table_print_state(&table, _devnull); ret = tsk_node_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_node_table_clear(&table); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); num_rows *= 2; flags = tsk_malloc(num_rows * sizeof(tsk_flags_t)); 
CU_ASSERT_FATAL(flags != NULL); tsk_memset(flags, 1, num_rows * sizeof(tsk_flags_t)); population = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(population != NULL); tsk_memset(population, 2, num_rows * sizeof(tsk_id_t)); time = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(time != NULL); tsk_memset(time, 0, num_rows * sizeof(double)); individual = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(individual != NULL); tsk_memset(individual, 3, num_rows * sizeof(tsk_id_t)); metadata = tsk_malloc(num_rows * sizeof(char)); tsk_memset(metadata, 'a', num_rows * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); for (j = 0; j < (tsk_id_t) num_rows + 1; j++) { metadata_offset[j] = (tsk_size_t) j; } ret = tsk_node_table_set_columns(&table, num_rows, flags, time, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.population, population, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.individual, individual, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); tsk_node_table_print_state(&table, _devnull); ret = tsk_node_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Append another num_rows onto the end */ ret = tsk_node_table_append_columns(&table, num_rows, flags, time, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * 
sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.flags + num_rows, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.population, population, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.population + num_rows, population, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.time + num_rows, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.individual, individual, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.individual + num_rows, individual, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata + num_rows, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.metadata_length, 2 * num_rows); tsk_node_table_print_state(&table, _devnull); ret = tsk_node_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Truncate back to the original number of rows. */ ret = tsk_node_table_truncate(&table, num_rows); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.population, population, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.individual, individual, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); ret = tsk_node_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); /* If population is NULL it should be set to -1. 
If metadata is NULL all metadatas * should be set to the empty string. If individual is NULL it should be set to -1. */ num_rows = 10; tsk_memset(population, 0xff, num_rows * sizeof(tsk_id_t)); tsk_memset(individual, 0xff, num_rows * sizeof(tsk_id_t)); ret = tsk_node_table_set_columns( &table, num_rows, flags, time, NULL, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.population, population, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.individual, individual, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, num_rows * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); /* flags and time cannot be NULL */ ret = tsk_node_table_set_columns( &table, num_rows, NULL, time, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_node_table_set_columns(&table, num_rows, flags, NULL, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_node_table_set_columns( &table, num_rows, flags, time, population, individual, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_node_table_set_columns( &table, num_rows, flags, time, population, individual, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* if metadata and metadata_offset are both null, all metadatas are zero length */ num_rows = 10; tsk_memset(metadata_offset, 0, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_node_table_set_columns( &table, num_rows, flags, time, NULL, NULL, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); 
CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_node_table_append_columns( &table, num_rows, flags, time, NULL, NULL, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.flags + num_rows, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.time + num_rows, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset + num_rows, metadata_offset, num_rows * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); tsk_node_table_print_state(&table, _devnull); ret = tsk_node_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Test extend method */ ret = tsk_node_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_init(&table2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Can't extend from self */ ret = tsk_node_table_extend(&table, &table, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANNOT_EXTEND_FROM_SELF); /* Two empty tables */ CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, 0)); ret = tsk_node_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, 0)); /* Row out of bounds */ ret = tsk_node_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* Num rows out of 
bounds */ ret = tsk_node_table_extend(&table, &table2, num_rows * 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* Copy rows in order if index NULL */ ret = tsk_node_table_set_columns(&table2, num_rows, flags, time, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_node_table_equals(&table, &table2, 0)); ret = tsk_node_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, 0)); /* Copy nothing if index not NULL but length zero */ ret = tsk_node_table_extend(&table, &table2, 0, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, 0)); /* Copy first N rows in order if index NULL */ ret = tsk_node_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_extend(&table, &table2, num_rows / 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_truncate(&table2, num_rows / 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, 0)); ret = tsk_node_table_set_columns(&table2, num_rows, flags, time, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Copy a subset */ ret = tsk_node_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_node_table_equals(&table, &table2, 0)); ret = tsk_node_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_row_subset; j++) { ret = tsk_node_table_get_row(&table, j, &node); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_get_row(&table2, row_subset[j], &node2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(node.flags, node2.flags); CU_ASSERT_EQUAL(node.time, node2.time); CU_ASSERT_EQUAL(node.population, node2.population); CU_ASSERT_EQUAL(node.individual, node2.individual); CU_ASSERT_EQUAL(node.metadata_length, node2.metadata_length); 
CU_ASSERT_EQUAL(tsk_memcmp(node.metadata, node2.metadata, node.metadata_length * sizeof(*node.metadata)), 0); } ret = tsk_node_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.metadata_schema_length, 0); CU_ASSERT_EQUAL(table.metadata_schema, NULL); const char *example = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_length = (tsk_size_t) strlen(example); const char *example2 = "A different example 🎄🌳🌴🌲🎋"; tsk_size_t example2_length = (tsk_size_t) strlen(example); tsk_node_table_set_metadata_schema(&table, example, example_length); CU_ASSERT_EQUAL(table.metadata_schema_length, example_length); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_schema, example, example_length), 0); tsk_node_table_copy(&table, &table2, TSK_NO_INIT); CU_ASSERT_EQUAL(table.metadata_schema_length, table2.metadata_schema_length); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_schema, table2.metadata_schema, example_length), 0); tsk_node_table_set_metadata_schema(&table2, example, example_length); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, 0)); tsk_node_table_set_metadata_schema(&table2, example2, example2_length); CU_ASSERT_FALSE(tsk_node_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_node_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_node_table_clear(&table); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); tsk_node_table_free(&table); CU_ASSERT_EQUAL(ret, 0); tsk_node_table_free(&table2); CU_ASSERT_EQUAL(ret, 0); free(flags); free(population); free(time); free(metadata); free(metadata_offset); free(individual); } static void test_node_table_takeset(void) { int ret = 0; tsk_id_t ret_id; tsk_node_table_t source_table, table; tsk_size_t num_rows = 100; tsk_id_t j; tsk_flags_t *flags; double *time; tsk_id_t *population; tsk_id_t *individual; char *metadata; tsk_size_t *metadata_offset; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; 
tsk_size_t zeros[num_rows + 1]; tsk_id_t neg_ones[num_rows]; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); tsk_memset(neg_ones, 0xff, num_rows * sizeof(tsk_id_t)); /* Make a table to copy from */ ret = tsk_node_table_init(&source_table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_node_table_add_row(&source_table, (tsk_flags_t) j, (double) j + 1, j + 2, j + 3, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); } /* Prepare arrays to be taken */ flags = tsk_malloc(num_rows * sizeof(tsk_flags_t)); CU_ASSERT_FATAL(flags != NULL); tsk_memcpy(flags, source_table.flags, num_rows * sizeof(tsk_flags_t)); time = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(time != NULL); tsk_memcpy(time, source_table.time, num_rows * sizeof(double)); population = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(population != NULL); tsk_memcpy(population, source_table.population, num_rows * sizeof(tsk_id_t)); individual = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(individual != NULL); tsk_memcpy(individual, source_table.individual, num_rows * sizeof(tsk_id_t)); metadata = tsk_malloc(num_rows * test_metadata_length * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); tsk_memcpy( metadata, source_table.metadata, num_rows * test_metadata_length * sizeof(char)); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); tsk_memcpy(metadata_offset, source_table.metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_node_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add one row so that we can check takeset frees it */ ret_id = tsk_node_table_add_row( &table, (tsk_flags_t) 1, 2, 3, 4, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_node_table_takeset_columns(&table, num_rows, flags, time, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_TRUE(tsk_node_table_equals(&source_table, &table, 0)); /* Test error states, all of these must not take the array, or free existing */ /* metadata and metadata offset must be simultaneously NULL or not */ ret = tsk_node_table_takeset_columns( &table, num_rows, NULL, time, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_node_table_takeset_columns(&table, num_rows, flags, NULL, population, individual, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_node_table_takeset_columns( &table, num_rows, flags, time, population, individual, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_node_table_takeset_columns( &table, num_rows, flags, time, population, individual, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Truncation after takeset keeps memory and max_rows */ ret = tsk_node_table_clear(&table); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(table.max_rows, num_rows); flags = tsk_malloc(num_rows * sizeof(tsk_flags_t)); CU_ASSERT_FATAL(flags != NULL); tsk_memcpy(flags, source_table.flags, num_rows * sizeof(tsk_flags_t)); time = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(time != NULL); tsk_memcpy(time, source_table.time, num_rows * sizeof(double)); /* if metadata and offset are both null, all entries are zero length, individual and population default to -1 */ num_rows = 10; ret = tsk_node_table_takeset_columns( &table, num_rows, flags, time, NULL, NULL, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL( tsk_memcmp(table.population, neg_ones, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.individual, neg_ones, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_node_table_free(&table); 
CU_ASSERT_EQUAL(ret, 0); ret = tsk_node_table_free(&source_table); CU_ASSERT_EQUAL(ret, 0); } static void test_node_table_update_row(void) { int ret; tsk_id_t ret_id; tsk_node_table_t table; tsk_node_t row; const char *metadata = "ABC"; ret = tsk_node_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row(&table, 0, 1.0, 2, 3, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&table, 1, 2.0, 3, 4, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&table, 2, 3.0, 4, 5, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_node_table_update_row(&table, 0, 1, 2.0, 3, 4, &metadata[1], 1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 1); CU_ASSERT_EQUAL_FATAL(row.time, 2.0); CU_ASSERT_EQUAL_FATAL(row.population, 3); CU_ASSERT_EQUAL_FATAL(row.individual, 4); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_node_table_update_row(&table, 0, row.flags + 1, row.time + 1, row.population + 1, row.individual + 1, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 2); CU_ASSERT_EQUAL_FATAL(row.time, 3.0); CU_ASSERT_EQUAL_FATAL(row.population, 4); CU_ASSERT_EQUAL_FATAL(row.individual, 5); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_node_table_update_row(&table, 0, 0, 0, 0, 0, metadata, 3); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 0); CU_ASSERT_EQUAL_FATAL(row.time, 0); CU_ASSERT_EQUAL_FATAL(row.population, 0); CU_ASSERT_EQUAL_FATAL(row.individual, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); 
CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_node_table_update_row(&table, 1, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_get_row(&table, 1, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 0); CU_ASSERT_EQUAL_FATAL(row.time, 0); CU_ASSERT_EQUAL_FATAL(row.population, 0); CU_ASSERT_EQUAL_FATAL(row.individual, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0); ret = tsk_node_table_get_row(&table, 2, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 2); CU_ASSERT_EQUAL_FATAL(row.time, 3.0); CU_ASSERT_EQUAL_FATAL(row.population, 4); CU_ASSERT_EQUAL_FATAL(row.individual, 5); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_node_table_update_row(&table, 3, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_node_table_free(&table); } static void test_node_table_keep_rows(void) { int ret; tsk_id_t ret_id; tsk_size_t j; tsk_node_table_t source, t1, t2; tsk_node_t row; tsk_bool_t keep[3] = { 1, 1, 1 }; tsk_id_t id_map[3]; const char *metadata = "ABC"; tsk_id_t indexes[] = { 0, 1, 2 }; ret = tsk_node_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row(&source, 0, 1.0, 2, 3, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&source, 1, 2.0, 3, 4, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&source, 2, 3.0, 4, 5, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_node_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&t1, &source, 0)); ret = tsk_node_table_keep_rows(&t1, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&t1, 
&source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_node_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_node_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_node_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_node_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.flags, 1); CU_ASSERT_EQUAL_FATAL(row.time, 2.0); CU_ASSERT_EQUAL_FATAL(row.population, 3); CU_ASSERT_EQUAL_FATAL(row.individual, 4); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 2); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); tsk_node_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_node_table_copy(&source, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_truncate(&t1, j + 1); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_node_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_node_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_node_table_equals(&source, &t2, 0)); tsk_node_table_free(&t1); tsk_node_table_free(&t2); } tsk_node_table_free(&source); } static void 
test_edge_table_with_options(tsk_flags_t options) { int ret; tsk_edge_table_t table, table2; tsk_size_t num_rows = 100; tsk_id_t j, ret_id; tsk_edge_t edge, edge2; tsk_id_t *parent, *child; double *left, *right; char *metadata; tsk_size_t *metadata_offset; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; char metadata_copy[test_metadata_length + 1]; tsk_id_t row_subset[6] = { 1, 9, 1, 0, 2, 2 }; tsk_size_t num_row_subset = 6; metadata_copy[test_metadata_length] = '\0'; ret = tsk_edge_table_init(&table, options); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_edge_table_set_max_rows_increment(&table, 1); tsk_edge_table_set_max_metadata_length_increment(&table, 1); tsk_edge_table_print_state(&table, _devnull); ret = tsk_edge_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { if (options & TSK_TABLE_NO_METADATA) { ret_id = tsk_edge_table_add_row(&table, (double) j, (double) j, j, j, test_metadata, test_metadata_length); CU_ASSERT_EQUAL(ret_id, TSK_ERR_METADATA_DISABLED); ret_id = tsk_edge_table_add_row(&table, (double) j, (double) j, j, j, NULL, 0); } else { ret_id = tsk_edge_table_add_row(&table, (double) j, (double) j, j, j, test_metadata, test_metadata_length); } CU_ASSERT_EQUAL_FATAL(ret_id, j); CU_ASSERT_EQUAL(table.left[j], j); CU_ASSERT_EQUAL(table.right[j], j); CU_ASSERT_EQUAL(table.parent[j], j); CU_ASSERT_EQUAL(table.child[j], j); CU_ASSERT_EQUAL(table.num_rows, (tsk_size_t) j + 1); if (options & TSK_TABLE_NO_METADATA) { CU_ASSERT_EQUAL(table.metadata_length, 0); CU_ASSERT_EQUAL(table.metadata, NULL); CU_ASSERT_EQUAL(table.metadata_offset, NULL); } else { CU_ASSERT_EQUAL( table.metadata_length, (tsk_size_t) (j + 1) * test_metadata_length); CU_ASSERT_EQUAL(table.metadata_offset[j + 1], table.metadata_length); /* check the metadata */ tsk_memcpy(metadata_copy, table.metadata + table.metadata_offset[j], test_metadata_length); CU_ASSERT_NSTRING_EQUAL(metadata_copy, test_metadata, 
test_metadata_length); } ret = tsk_edge_table_get_row(&table, (tsk_id_t) j, &edge); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(edge.id, j); CU_ASSERT_EQUAL(edge.left, j); CU_ASSERT_EQUAL(edge.right, j); CU_ASSERT_EQUAL(edge.parent, j); CU_ASSERT_EQUAL(edge.child, j); if (options & TSK_TABLE_NO_METADATA) { CU_ASSERT_EQUAL(edge.metadata_length, 0); CU_ASSERT_EQUAL(edge.metadata, NULL); } else { CU_ASSERT_EQUAL(edge.metadata_length, test_metadata_length); CU_ASSERT_NSTRING_EQUAL(edge.metadata, test_metadata, test_metadata_length); } } ret = tsk_edge_table_get_row(&table, (tsk_id_t) num_rows, &edge); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); tsk_edge_table_print_state(&table, _devnull); ret = tsk_edge_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); num_rows *= 2; left = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(left != NULL); tsk_memset(left, 0, num_rows * sizeof(double)); right = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(right != NULL); tsk_memset(right, 0, num_rows * sizeof(double)); parent = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(parent != NULL); tsk_memset(parent, 1, num_rows * sizeof(tsk_id_t)); child = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(child != NULL); tsk_memset(child, 1, num_rows * sizeof(tsk_id_t)); metadata = tsk_malloc(num_rows * sizeof(char)); tsk_memset(metadata, 'a', num_rows * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); for (j = 0; j < (tsk_id_t) num_rows + 1; j++) { metadata_offset[j] = (tsk_size_t) j; } if (options & TSK_TABLE_NO_METADATA) { ret = tsk_edge_table_set_columns( &table, num_rows, left, right, parent, child, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_METADATA_DISABLED); ret = tsk_edge_table_set_columns( &table, num_rows, left, right, parent, child, NULL, NULL); } else { ret = tsk_edge_table_set_columns( &table, 
num_rows, left, right, parent, child, metadata, metadata_offset); } CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.child, child, num_rows * sizeof(tsk_id_t)), 0); if (options & TSK_TABLE_NO_METADATA) { CU_ASSERT_EQUAL(table.metadata, NULL); CU_ASSERT_EQUAL(table.metadata_offset, NULL); CU_ASSERT_EQUAL(table.metadata_length, 0); } else { CU_ASSERT_EQUAL( tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.metadata_length, num_rows); } CU_ASSERT_EQUAL(table.num_rows, num_rows); /* Append another num_rows to the end. */ if (options & TSK_TABLE_NO_METADATA) { ret = tsk_edge_table_append_columns( &table, num_rows, left, right, parent, child, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_METADATA_DISABLED); ret = tsk_edge_table_append_columns( &table, num_rows, left, right, parent, child, NULL, NULL); } else { ret = tsk_edge_table_append_columns( &table, num_rows, left, right, parent, child, metadata, metadata_offset); } CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.left + num_rows, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.right + num_rows, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parent + num_rows, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.child, child, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.child + 
num_rows, child, num_rows * sizeof(tsk_id_t)), 0); if (options & TSK_TABLE_NO_METADATA) { CU_ASSERT_EQUAL(table.metadata, NULL); CU_ASSERT_EQUAL(table.metadata_offset, NULL); CU_ASSERT_EQUAL(table.metadata_length, 0); } else { CU_ASSERT_EQUAL( tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata + num_rows, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.metadata_length, 2 * num_rows); } CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); /* Truncate back to num_rows */ ret = tsk_edge_table_truncate(&table, num_rows); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.child, child, num_rows * sizeof(tsk_id_t)), 0); if (options & TSK_TABLE_NO_METADATA) { CU_ASSERT_EQUAL(table.metadata, NULL); CU_ASSERT_EQUAL(table.metadata_offset, NULL); CU_ASSERT_EQUAL(table.metadata_length, 0); } else { CU_ASSERT_EQUAL( tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.metadata_length, num_rows); } CU_ASSERT_EQUAL(table.num_rows, num_rows); ret = tsk_edge_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); /* Test equality with and without metadata */ tsk_edge_table_copy(&table, &table2, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); if (!(options & TSK_TABLE_NO_METADATA)) { /* Change the metadata values */ table2.metadata[0] = 0; CU_ASSERT_FALSE(tsk_edge_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the last 
metadata entry */ table2.metadata_offset[table2.num_rows] = table2.metadata_offset[table2.num_rows - 1]; CU_ASSERT_FALSE(tsk_edge_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Delete all metadata */ tsk_memset(table2.metadata_offset, 0, (table2.num_rows + 1) * sizeof(*table2.metadata_offset)); CU_ASSERT_FALSE(tsk_edge_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); } tsk_edge_table_free(&table2); /* Inputs cannot be NULL */ ret = tsk_edge_table_set_columns( &table, num_rows, NULL, right, parent, child, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_set_columns( &table, num_rows, left, NULL, parent, child, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_set_columns( &table, num_rows, left, right, NULL, child, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_set_columns( &table, num_rows, left, right, parent, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_set_columns( &table, num_rows, left, right, parent, child, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_set_columns( &table, num_rows, left, right, parent, child, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* if metadata and metadata_offset are both null, all metadatas are zero length */ num_rows = 10; tsk_memset(metadata_offset, 0, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_edge_table_set_columns( &table, num_rows, left, right, parent, child, NULL, NULL); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); 
CU_ASSERT_EQUAL(tsk_memcmp(table.child, child, num_rows * sizeof(tsk_id_t)), 0); if (options & TSK_TABLE_NO_METADATA) { CU_ASSERT_EQUAL(table.metadata, NULL); CU_ASSERT_EQUAL(table.metadata_offset, NULL); } else { CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); } CU_ASSERT_EQUAL(table.metadata_length, 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); ret = tsk_edge_table_append_columns( &table, num_rows, left, right, parent, child, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.left + num_rows, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.right + num_rows, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parent + num_rows, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.child, child, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.child + num_rows, child, num_rows * sizeof(tsk_id_t)), 0); if (options & TSK_TABLE_NO_METADATA) { CU_ASSERT_EQUAL(table.metadata, NULL); CU_ASSERT_EQUAL(table.metadata_offset, NULL); } else { CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset + num_rows, metadata_offset, num_rows * sizeof(tsk_size_t)), 0); } CU_ASSERT_EQUAL(table.metadata_length, 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); tsk_edge_table_print_state(&table, _devnull); ret = tsk_edge_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Test extend method */ ret = tsk_edge_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_init(&table2, options); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Can't extend from self */ 
ret = tsk_edge_table_extend(&table, &table, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANNOT_EXTEND_FROM_SELF); /* Two empty tables */ CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, 0)); ret = tsk_edge_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, 0)); /* Row out of bounds */ ret = tsk_edge_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); /* Num rows out of bounds */ ret = tsk_edge_table_extend(&table, &table2, num_rows * 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); /* Copy rows in order if index NULL */ if (options & TSK_TABLE_NO_METADATA) { ret = tsk_edge_table_set_columns( &table2, num_rows, left, right, parent, child, NULL, NULL); } else { ret = tsk_edge_table_set_columns( &table2, num_rows, left, right, parent, child, metadata, metadata_offset); } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_edge_table_equals(&table, &table2, 0)); ret = tsk_edge_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, 0)); /* Copy nothing if index not NULL but length zero */ ret = tsk_edge_table_extend(&table, &table2, 0, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, 0)); /* Copy first N rows in order if index NULL */ ret = tsk_edge_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_extend(&table, &table2, num_rows / 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_truncate(&table2, num_rows / 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, 0)); if (options & TSK_TABLE_NO_METADATA) { ret = tsk_edge_table_set_columns( &table2, num_rows, left, right, parent, child, NULL, NULL); } else { ret = tsk_edge_table_set_columns( &table2, num_rows, left, right, 
parent, child, metadata, metadata_offset); } CU_ASSERT_EQUAL_FATAL(ret, 0); /* Copy a subset */ ret = tsk_edge_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_edge_table_equals(&table, &table2, 0)); ret = tsk_edge_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_row_subset; j++) { ret = tsk_edge_table_get_row(&table, j, &edge); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_get_row(&table2, row_subset[j], &edge2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(edge.parent, edge2.parent); CU_ASSERT_EQUAL(edge.child, edge2.child); CU_ASSERT_EQUAL(edge.left, edge2.left); CU_ASSERT_EQUAL(edge.right, edge2.right); CU_ASSERT_EQUAL(edge.metadata_length, edge2.metadata_length) CU_ASSERT_EQUAL(tsk_memcmp(edge.metadata, edge2.metadata, edge.metadata_length * sizeof(*edge.metadata)), 0); } ret = tsk_edge_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.metadata_schema_length, 0); CU_ASSERT_EQUAL(table.metadata_schema, NULL); const char *example = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_length = (tsk_size_t) strlen(example); const char *example2 = "A different example 🎄🌳🌴🌲🎋"; tsk_size_t example2_length = (tsk_size_t) strlen(example); ret = tsk_edge_table_set_metadata_schema(&table, example, example_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.metadata_schema_length, example_length); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_schema, example, example_length), 0); ret = tsk_edge_table_copy(&table, &table2, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.metadata_schema_length, table2.metadata_schema_length); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_schema, table2.metadata_schema, example_length), 0); ret = tsk_edge_table_set_metadata_schema(&table2, example, example_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, 0)); 
ret = tsk_edge_table_set_metadata_schema(&table2, example2, example2_length); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_edge_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_edge_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); ret = tsk_edge_table_clear(&table); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_edge_table_free(&table); CU_ASSERT_EQUAL(ret, 0); ret = tsk_edge_table_free(&table2); CU_ASSERT_EQUAL(ret, 0); free(left); free(right); free(parent); free(child); free(metadata); free(metadata_offset); } static void test_edge_table(void) { test_edge_table_with_options(0); test_edge_table_with_options(TSK_TABLE_NO_METADATA); } static void test_edge_table_update_row(void) { int ret; tsk_id_t ret_id; tsk_edge_table_t table; tsk_edge_t row; const char *metadata = "ABC"; ret = tsk_edge_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&table, 0, 1.0, 2, 3, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&table, 1, 2.0, 3, 4, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&table, 2, 3.0, 4, 5, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_edge_table_update_row(&table, 0, 1, 2.0, 3, 4, &metadata[1], 1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.left, 1); CU_ASSERT_EQUAL_FATAL(row.right, 2.0); CU_ASSERT_EQUAL_FATAL(row.parent, 3); CU_ASSERT_EQUAL_FATAL(row.child, 4); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_edge_table_update_row(&table, 0, row.left + 1, row.right + 1, row.parent + 1, row.child + 1, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.left, 2); CU_ASSERT_EQUAL_FATAL(row.right, 3.0); 
CU_ASSERT_EQUAL_FATAL(row.parent, 4);
    CU_ASSERT_EQUAL_FATAL(row.child, 5);
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B');

    /* Overwrite row 0 with zeroed columns and the full 3-byte metadata */
    ret = tsk_edge_table_update_row(&table, 0, 0, 0, 0, 0, metadata, 3);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_edge_table_get_row(&table, 0, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.left, 0);
    CU_ASSERT_EQUAL_FATAL(row.right, 0);
    CU_ASSERT_EQUAL_FATAL(row.parent, 0);
    CU_ASSERT_EQUAL_FATAL(row.child, 0);
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A');
    CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B');
    CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C');

    /* Updating row 1 with NULL metadata leaves it with zero-length metadata */
    ret = tsk_edge_table_update_row(&table, 1, 0, 0, 0, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_edge_table_get_row(&table, 1, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.left, 0);
    CU_ASSERT_EQUAL_FATAL(row.right, 0);
    CU_ASSERT_EQUAL_FATAL(row.parent, 0);
    CU_ASSERT_EQUAL_FATAL(row.child, 0);
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0);

    /* Row 2 was never updated, so it must still hold the values it was added
     * with */
    ret = tsk_edge_table_get_row(&table, 2, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.left, 2);
    CU_ASSERT_EQUAL_FATAL(row.right, 3.0);
    CU_ASSERT_EQUAL_FATAL(row.parent, 4);
    CU_ASSERT_EQUAL_FATAL(row.child, 5);
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A');
    CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B');
    CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C');

    /* The table has three rows, so index 3 is out of bounds */
    ret = tsk_edge_table_update_row(&table, 3, 0, 0, 0, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS);

    tsk_edge_table_free(&table);
}

/* Checks tsk_edge_table_update_row on a table created with
 * TSK_TABLE_NO_METADATA: updates without metadata succeed, and an update
 * that supplies metadata is expected to fail with
 * TSK_ERR_METADATA_DISABLED. */
static void
test_edge_table_update_row_no_metadata(void)
{
    int ret;
    tsk_id_t ret_id;
    tsk_edge_table_t table;
    tsk_edge_t row;
    const char *metadata = "ABC";

    ret = tsk_edge_table_init(&table, TSK_TABLE_NO_METADATA);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret_id = tsk_edge_table_add_row(&table, 0, 1.0, 2, 3, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_edge_table_add_row(&table, 1, 2.0, 3, 4, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_edge_table_add_row(&table, 2, 3.0, 4, 5, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);

    ret = tsk_edge_table_update_row(&table, 0, 1, 2.0, 3, 4, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_edge_table_get_row(&table, 0, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.left, 1);
    CU_ASSERT_EQUAL_FATAL(row.right, 2.0);
    CU_ASSERT_EQUAL_FATAL(row.parent, 3);
    CU_ASSERT_EQUAL_FATAL(row.child, 4);
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0);

    /* Feed the row just read back into update_row, bumping each column by 1 */
    ret = tsk_edge_table_update_row(&table, 0, row.left + 1, row.right + 1,
        row.parent + 1, row.child + 1, row.metadata, row.metadata_length);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_edge_table_get_row(&table, 0, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.left, 2);
    CU_ASSERT_EQUAL_FATAL(row.right, 3.0);
    CU_ASSERT_EQUAL_FATAL(row.parent, 4);
    CU_ASSERT_EQUAL_FATAL(row.child, 5);
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0);

    /* Supplying metadata to a no-metadata table must be rejected */
    ret = tsk_edge_table_update_row(&table, 1, 0, 0, 0, 0, metadata, 3);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_METADATA_DISABLED);

    tsk_edge_table_free(&table);
}

/* Checks tsk_edge_table_keep_rows against a three-row source table whose
 * rows carry 1, 2 and 3 bytes of metadata respectively. */
static void
test_edge_table_keep_rows(void)
{
    int ret;
    tsk_id_t ret_id;
    tsk_size_t j;
    tsk_edge_table_t source, t1, t2;
    tsk_edge_t row;
    tsk_bool_t keep[3] = { 1, 1, 1 };
    tsk_id_t id_map[3];
    const char *metadata = "ABC";
    tsk_id_t indexes[] = { 0, 1, 2 };

    ret = tsk_edge_table_init(&source, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret_id = tsk_edge_table_add_row(&source, 0, 1.0, 2, 3, metadata, 1);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_edge_table_add_row(&source, 1, 2.0, 3, 4, metadata, 2);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_edge_table_add_row(&source, 2, 3.0, 4, 5, metadata, 3);
    CU_ASSERT_FATAL(ret_id >= 0);

    /* Keeping every row must leave the table equal to the source, with or
     * without an id_map output array. */
    ret = tsk_edge_table_copy(&source, &t1, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_edge_table_keep_rows(&t1, keep, 0, id_map);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_edge_table_equals(&t1, &source, 0));
    ret = tsk_edge_table_keep_rows(&t1, keep, 0, NULL);
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&t1, &source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_edge_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_edge_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_edge_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_edge_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.left, 1); CU_ASSERT_EQUAL_FATAL(row.right, 2.0); CU_ASSERT_EQUAL_FATAL(row.parent, 3); CU_ASSERT_EQUAL_FATAL(row.child, 4); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 2); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); tsk_edge_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_edge_table_copy(&source, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_truncate(&t1, j + 1); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_edge_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_edge_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&source, &t2, 0)); tsk_edge_table_free(&t1); 
tsk_edge_table_free(&t2); } tsk_edge_table_free(&source); } static void test_edge_table_keep_rows_no_metadata(void) { int ret; tsk_id_t ret_id; tsk_size_t j; tsk_edge_table_t source, t1, t2; tsk_edge_t row; tsk_bool_t keep[3] = { 1, 1, 1 }; tsk_id_t id_map[3]; tsk_id_t indexes[] = { 0, 1, 2 }; ret = tsk_edge_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&source, 0, 1.0, 2, 3, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&source, 1, 2.0, 3, 4, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&source, 2, 3.0, 4, 5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_edge_table_copy(&source, &t1, TSK_TABLE_NO_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&t1, &source, 0)); ret = tsk_edge_table_keep_rows(&t1, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&t1, &source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_edge_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_edge_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_edge_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_edge_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.left, 1); CU_ASSERT_EQUAL_FATAL(row.right, 2.0); CU_ASSERT_EQUAL_FATAL(row.parent, 3); CU_ASSERT_EQUAL_FATAL(row.child, 4); 
CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0); tsk_edge_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_edge_table_copy(&source, &t2, TSK_TABLE_NO_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_copy(&source, &t1, TSK_TABLE_NO_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_truncate(&t1, j + 1); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_edge_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_edge_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&source, &t2, 0)); tsk_edge_table_free(&t1); tsk_edge_table_free(&t2); } tsk_edge_table_free(&source); } static void test_edge_table_takeset_with_options(tsk_flags_t table_options) { int ret = 0; tsk_id_t ret_id; tsk_edge_table_t source_table, table; tsk_size_t num_rows = 100; tsk_id_t j; double *left; double *right; tsk_id_t *parent; tsk_id_t *child; char *metadata; tsk_size_t *metadata_offset; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; tsk_size_t zeros[num_rows + 1]; tsk_id_t neg_ones[num_rows]; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); tsk_memset(neg_ones, 0xff, num_rows * sizeof(tsk_id_t)); /* Make a table to copy from */ ret = tsk_edge_table_init(&source_table, table_options); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { if (table_options & TSK_TABLE_NO_METADATA) { ret_id = tsk_edge_table_add_row( &source_table, (double) j, (double) j + 1, j + 2, j + 3, NULL, 0); } else { ret_id = tsk_edge_table_add_row(&source_table, (double) j, (double) j + 1, j + 2, j + 3, test_metadata, test_metadata_length); } CU_ASSERT_EQUAL_FATAL(ret_id, j); } /* Prepare 
arrays to be taken */ left = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(left != NULL); tsk_memcpy(left, source_table.left, num_rows * sizeof(double)); right = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(right != NULL); tsk_memcpy(right, source_table.right, num_rows * sizeof(double)); parent = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(parent != NULL); tsk_memcpy(parent, source_table.parent, num_rows * sizeof(tsk_id_t)); child = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(child != NULL); tsk_memcpy(child, source_table.child, num_rows * sizeof(tsk_id_t)); if (table_options & TSK_TABLE_NO_METADATA) { metadata = NULL; metadata_offset = NULL; test_metadata = NULL; test_metadata_length = 0; } else { metadata = tsk_malloc(num_rows * test_metadata_length * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); tsk_memcpy(metadata, source_table.metadata, num_rows * test_metadata_length * sizeof(char)); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); tsk_memcpy(metadata_offset, source_table.metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)); } ret = tsk_edge_table_init(&table, table_options); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add one row so that we can check takeset frees it */ ret_id = tsk_edge_table_add_row( &table, 1, 2, 3, 4, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_edge_table_takeset_columns( &table, num_rows, left, right, parent, child, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&source_table, &table, 0)); /* Test error states, all of these must not take the array, or free existing */ /* metadata and metadata offset must be simultaneously NULL or not */ ret = tsk_edge_table_takeset_columns( &table, num_rows, NULL, right, parent, child, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_takeset_columns( &table, num_rows, left, 
NULL, parent, child, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_takeset_columns( &table, num_rows, left, right, NULL, child, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_takeset_columns( &table, num_rows, left, right, parent, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); if (table_options & TSK_TABLE_NO_METADATA) { /* It isn't used, so any pointer does for testing that presence of metadata fails */ ret = tsk_edge_table_takeset_columns( &table, num_rows, left, right, parent, child, (char *) child, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_METADATA_DISABLED); } else { ret = tsk_edge_table_takeset_columns( &table, num_rows, left, right, parent, child, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_edge_table_takeset_columns( &table, num_rows, left, right, parent, child, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); } /* Truncation after takeset keeps memory and max_rows */ ret = tsk_edge_table_clear(&table); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(table.max_rows, num_rows); ret = tsk_edge_table_free(&table); CU_ASSERT_EQUAL(ret, 0); ret = tsk_edge_table_free(&source_table); CU_ASSERT_EQUAL(ret, 0); } static void test_edge_table_takeset(void) { test_edge_table_takeset_with_options(TSK_TABLE_NO_METADATA); test_edge_table_takeset_with_options(0); } static void test_edge_table_copy_semantics(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t t1, t2; tsk_edge_table_t edges; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); insert_edge_metadata(&t1); /* t1 now has metadata. 
We should be able to copy to another table with metadata */ ret = tsk_table_collection_copy(&t1, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); /* We should not be able to copy into a table with no metadata */ ret = tsk_table_collection_copy(&t1, &t2, TSK_TC_NO_EDGE_METADATA); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_METADATA_DISABLED); tsk_table_collection_free(&t2); tsk_table_collection_free(&t1); ret = tsk_treeseq_copy_tables(&ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* t1 has no metadata, but metadata is enabled. We should be able to copy * into a table with either metadata enabled or disabled. */ ret = tsk_table_collection_copy(&t1, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); ret = tsk_table_collection_copy(&t1, &t2, TSK_TC_NO_EDGE_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); /* Try copying into a table directly */ ret = tsk_edge_table_copy(&t1.edges, &edges, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_edge_table_equals(&t1.edges, &edges, 0)); tsk_edge_table_free(&edges); tsk_table_collection_free(&t1); tsk_treeseq_free(&ts); } static void test_edge_table_squash(void) { int ret; tsk_table_collection_t tables; const char *nodes_ex = "1 0 -1 -1\n" "1 0 -1 -1\n" "0 0.253 -1 -1\n"; const char *edges_ex = "0 2 2 0\n" "2 10 2 0\n" "0 2 2 1\n" "2 10 2 1\n"; /* 2 / \ 0 1 */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; parse_nodes(nodes_ex, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 3); parse_edges(edges_ex, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 4); ret = tsk_edge_table_squash(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); // Check output. CU_ASSERT_EQUAL(tables.edges.num_rows, 2); // Free things. 
tsk_table_collection_free(&tables); } static void test_edge_table_squash_multiple_parents(void) { int ret; tsk_table_collection_t tables; const char *nodes_ex = "1 0.000 -1 -1\n" "1 0.000 -1 -1\n" "1 0.000 -1 -1\n" "1 0.000 -1 -1\n" "0 1.000 -1 -1\n" "0 1.000 -1 -1\n"; const char *edges_ex = "5 10 5 3\n" "5 10 5 2\n" "0 5 5 3\n" "0 5 5 2\n" "4 10 4 1\n" "0 4 4 1\n" "4 10 4 0\n" "0 4 4 0\n"; /* 4 5 / \ / \ 0 1 2 3 */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; parse_nodes(nodes_ex, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 6); parse_edges(edges_ex, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 8); ret = tsk_edge_table_squash(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); // Check output. CU_ASSERT_EQUAL(tables.edges.num_rows, 4); // Free things. tsk_table_collection_free(&tables); } static void test_edge_table_squash_empty(void) { int ret; tsk_table_collection_t tables; const char *nodes_ex = "1 0 -1 -1\n" "1 0 -1 -1\n" "0 0.253 -1 -1\n"; const char *edges_ex = ""; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; parse_nodes(nodes_ex, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 3); parse_edges(edges_ex, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 0); ret = tsk_edge_table_squash(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); // Free things. 
tsk_table_collection_free(&tables); } static void test_edge_table_squash_single_edge(void) { int ret; tsk_table_collection_t tables; const char *nodes_ex = "1 0 -1 -1\n" "0 0 -1 -1\n"; const char *edges_ex = "0 1 1 0\n"; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; parse_nodes(nodes_ex, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 2); parse_edges(edges_ex, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 1); ret = tsk_edge_table_squash(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); // Free things. tsk_table_collection_free(&tables); } static void test_edge_table_squash_bad_intervals(void) { int ret; tsk_table_collection_t tables; const char *nodes_ex = "1 0 -1 -1\n" "0 0 -1 -1\n"; const char *edges_ex = "0 0.6 1 0\n" "0.4 1 1 0\n"; ret = tsk_table_collection_init(&tables, TSK_TC_NO_EDGE_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; parse_nodes(nodes_ex, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 2); parse_edges(edges_ex, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 2); ret = tsk_edge_table_squash(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN); // Free things. 
tsk_table_collection_free(&tables); } static void test_edge_table_squash_metadata(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; ret_id = tsk_edge_table_add_row(&tables.edges, 0, 0, 1, 1, "metadata", 8); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_edge_table_squash(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA); tsk_table_collection_free(&tables); ret = tsk_table_collection_init(&tables, TSK_TC_NO_EDGE_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; ret_id = tsk_edge_table_add_row(&tables.edges, 0, 0, 1, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_edge_table_squash(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); } static void test_site_table(void) { int ret; tsk_id_t ret_id; tsk_site_table_t table, table2; tsk_size_t num_rows, j; char *ancestral_state; char *metadata; double *position; tsk_site_t site, site2; tsk_size_t *ancestral_state_offset; tsk_size_t *metadata_offset; tsk_id_t row_subset[6] = { 1, 9, 1, 0, 2, 2 }; tsk_size_t num_row_subset = 6; ret = tsk_site_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_site_table_set_max_rows_increment(&table, 1); tsk_site_table_set_max_metadata_length_increment(&table, 1); tsk_site_table_set_max_ancestral_state_length_increment(&table, 1); tsk_site_table_print_state(&table, _devnull); ret = tsk_site_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&table, 0, "A", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); CU_ASSERT_EQUAL(table.position[0], 0); CU_ASSERT_EQUAL(table.ancestral_state_offset[0], 0); CU_ASSERT_EQUAL(table.ancestral_state_offset[1], 1); CU_ASSERT_EQUAL(table.ancestral_state_length, 1); CU_ASSERT_EQUAL(table.metadata_offset[0], 0); CU_ASSERT_EQUAL(table.metadata_offset[1], 0); CU_ASSERT_EQUAL(table.metadata_length, 0); 
CU_ASSERT_EQUAL(table.num_rows, 1); ret = tsk_site_table_get_row(&table, 0, &site); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(site.position, 0); CU_ASSERT_EQUAL(site.ancestral_state_length, 1); CU_ASSERT_NSTRING_EQUAL(site.ancestral_state, "A", 1); CU_ASSERT_EQUAL(site.metadata_length, 0); ret_id = tsk_site_table_add_row(&table, 1, "AA", 2, "{}", 2); CU_ASSERT_EQUAL_FATAL(ret_id, 1); CU_ASSERT_EQUAL(table.position[1], 1); CU_ASSERT_EQUAL(table.ancestral_state_offset[2], 3); CU_ASSERT_EQUAL(table.metadata_offset[1], 0); CU_ASSERT_EQUAL(table.metadata_offset[2], 2); CU_ASSERT_EQUAL(table.metadata_length, 2); CU_ASSERT_EQUAL(table.num_rows, 2); ret = tsk_site_table_get_row(&table, 1, &site); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(site.position, 1); CU_ASSERT_EQUAL(site.ancestral_state_length, 2); CU_ASSERT_NSTRING_EQUAL(site.ancestral_state, "AA", 2); CU_ASSERT_EQUAL(site.metadata_length, 2); CU_ASSERT_NSTRING_EQUAL(site.metadata, "{}", 2); ret_id = tsk_site_table_add_row(&table, 2, "A", 1, "metadata", 8); CU_ASSERT_EQUAL_FATAL(ret_id, 2); CU_ASSERT_EQUAL(table.position[1], 1); CU_ASSERT_EQUAL(table.ancestral_state_offset[3], 4); CU_ASSERT_EQUAL(table.ancestral_state_length, 4); CU_ASSERT_EQUAL(table.metadata_offset[3], 10); CU_ASSERT_EQUAL(table.metadata_length, 10); CU_ASSERT_EQUAL(table.num_rows, 3); ret = tsk_site_table_get_row(&table, 3, &site); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); tsk_site_table_print_state(&table, _devnull); ret = tsk_site_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_site_table_clear(&table); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.ancestral_state_length, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); CU_ASSERT_EQUAL(table.ancestral_state_offset[0], 0); CU_ASSERT_EQUAL(table.metadata_offset[0], 0); num_rows = 100; position = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(position != NULL); ancestral_state = tsk_malloc(num_rows * sizeof(char)); 
CU_ASSERT_FATAL(ancestral_state != NULL); ancestral_state_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(ancestral_state_offset != NULL); metadata = tsk_malloc(num_rows * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); for (j = 0; j < num_rows; j++) { position[j] = (double) j; ancestral_state[j] = (char) j; ancestral_state_offset[j] = (tsk_size_t) j; metadata[j] = (char) ('A' + j); metadata_offset[j] = (tsk_size_t) j; } ancestral_state_offset[num_rows] = num_rows; metadata_offset[num_rows] = num_rows; ret = tsk_site_table_set_columns(&table, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.position, position, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.ancestral_state, ancestral_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.ancestral_state_length, num_rows); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.metadata_length, num_rows); CU_ASSERT_EQUAL(table.num_rows, num_rows); /* Append another num rows */ ret = tsk_site_table_append_columns(&table, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.position, position, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.position + num_rows, position, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.ancestral_state, ancestral_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.ancestral_state + num_rows, ancestral_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata + num_rows, metadata, num_rows * sizeof(char)), 0); 
CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.ancestral_state_length, 2 * num_rows); /* truncate back to num_rows */ ret = tsk_site_table_truncate(&table, num_rows); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.position, position, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.ancestral_state, ancestral_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.ancestral_state_length, num_rows); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.metadata_length, num_rows); CU_ASSERT_EQUAL(table.num_rows, num_rows); ret = tsk_site_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); /* Test equality with and without metadata */ tsk_site_table_copy(&table, &table2, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the metadata values */ table2.metadata[0] = 0; CU_ASSERT_FALSE(tsk_site_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the last metadata entry */ table2.metadata_offset[table2.num_rows] = table2.metadata_offset[table2.num_rows - 1]; CU_ASSERT_FALSE(tsk_site_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Delete all metadata */ tsk_memset(table2.metadata_offset, 0, (table2.num_rows + 1) * sizeof(*table2.metadata_offset)); CU_ASSERT_FALSE(tsk_site_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_site_table_free(&table2); /* Inputs cannot be NULL */ ret = tsk_site_table_set_columns(&table, num_rows, NULL, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_site_table_set_columns(&table, num_rows, position, NULL, 
ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_site_table_set_columns( &table, num_rows, position, ancestral_state, NULL, metadata, metadata_offset); /* Metadata and metadata_offset must both be null */ ret = tsk_site_table_set_columns(&table, num_rows, position, ancestral_state, ancestral_state_offset, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_site_table_set_columns(&table, num_rows, position, ancestral_state, ancestral_state_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Set metadata to NULL */ ret = tsk_site_table_set_columns( &table, num_rows, position, ancestral_state, ancestral_state_offset, NULL, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(metadata_offset, 0, (num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_EQUAL(tsk_memcmp(table.position, position, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.ancestral_state, ancestral_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.ancestral_state_length, num_rows); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.metadata_length, 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); /* Test extend method */ ret = tsk_site_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_init(&table2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Can't extend from self */ ret = tsk_site_table_extend(&table, &table, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANNOT_EXTEND_FROM_SELF); /* Two empty tables */ CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, 0)); ret = tsk_site_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, 0)); /* Row out of bounds */ ret = tsk_site_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); 
/* Num rows out of bounds */ ret = tsk_site_table_extend(&table, &table2, num_rows * 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); /* Copy rows in order if index NULL */ ret = tsk_site_table_set_columns(&table2, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_site_table_equals(&table, &table2, 0)); ret = tsk_site_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, 0)); /* Copy nothing if index not NULL but length zero */ ret = tsk_site_table_extend(&table, &table2, 0, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, 0)); /* Copy first N rows in order if index NULL */ ret = tsk_site_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_extend(&table, &table2, num_rows / 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_truncate(&table2, num_rows / 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, 0)); ret = tsk_site_table_set_columns(&table2, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Copy a subset */ ret = tsk_site_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_site_table_equals(&table, &table2, 0)); ret = tsk_site_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_row_subset; j++) { ret = tsk_site_table_get_row(&table, (tsk_id_t) j, &site); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_get_row(&table2, row_subset[j], &site2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(site.position, site2.position); CU_ASSERT_EQUAL(site.ancestral_state_length, site2.ancestral_state_length); CU_ASSERT_EQUAL(site.metadata_length, site2.metadata_length); 
CU_ASSERT_EQUAL(tsk_memcmp(site.ancestral_state, site2.ancestral_state, site.ancestral_state_length * sizeof(*site.ancestral_state)), 0); CU_ASSERT_EQUAL(tsk_memcmp(site.metadata, site2.metadata, site.metadata_length * sizeof(*site.metadata)), 0); } /* Test for bad offsets */ ancestral_state_offset[0] = 1; ret = tsk_site_table_set_columns(&table, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); ancestral_state_offset[0] = 0; ancestral_state_offset[num_rows] = 0; ret = tsk_site_table_set_columns(&table, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); ancestral_state_offset[0] = 0; metadata_offset[0] = 0; ret = tsk_site_table_set_columns(&table, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); metadata_offset[0] = 0; metadata_offset[num_rows] = 0; ret = tsk_site_table_set_columns(&table, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); ret = tsk_site_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.metadata_schema_length, 0); CU_ASSERT_EQUAL(table.metadata_schema, NULL); const char *example = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_length = (tsk_size_t) strlen(example); const char *example2 = "A different example 🎄🌳🌴🌲🎋"; tsk_size_t example2_length = (tsk_size_t) strlen(example); tsk_site_table_set_metadata_schema(&table, example, example_length); CU_ASSERT_EQUAL(table.metadata_schema_length, example_length); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_schema, example, example_length), 0); tsk_site_table_copy(&table, &table2, TSK_NO_INIT); CU_ASSERT_EQUAL(table.metadata_schema_length, table2.metadata_schema_length); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_schema, 
table2.metadata_schema, example_length), 0); tsk_site_table_set_metadata_schema(&table2, example, example_length); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, 0)); tsk_site_table_set_metadata_schema(&table2, example2, example2_length); CU_ASSERT_FALSE(tsk_site_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_site_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); ret = tsk_site_table_clear(&table); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.ancestral_state_length, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); tsk_site_table_free(&table); CU_ASSERT_EQUAL(ret, 0); tsk_site_table_free(&table2); CU_ASSERT_EQUAL(ret, 0); free(position); free(ancestral_state); free(ancestral_state_offset); free(metadata); free(metadata_offset); } static void test_site_table_takeset(void) { int ret = 0; tsk_id_t ret_id; tsk_site_table_t source_table, table; tsk_size_t num_rows = 100; tsk_id_t j; double *position; char *ancestral_state; tsk_size_t *ancestral_state_offset; char *metadata; tsk_size_t *metadata_offset; const char *test_ancestral_state = "red"; tsk_size_t test_ancestral_state_length = 3; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; tsk_size_t zeros[num_rows + 1]; tsk_id_t neg_ones[num_rows]; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); tsk_memset(neg_ones, 0xff, num_rows * sizeof(tsk_id_t)); /* Make a table to copy from */ ret = tsk_site_table_init(&source_table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_site_table_add_row(&source_table, (double) j, test_ancestral_state, test_ancestral_state_length, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); } /* Prepare arrays to be taken */ position = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(position != NULL); tsk_memcpy(position, source_table.position, num_rows * sizeof(double)); ancestral_state = tsk_malloc(num_rows * test_ancestral_state_length 
* sizeof(char));
    CU_ASSERT_FATAL(ancestral_state != NULL);
    tsk_memcpy(ancestral_state, source_table.ancestral_state,
        num_rows * test_ancestral_state_length * sizeof(char));
    ancestral_state_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t));
    CU_ASSERT_FATAL(ancestral_state_offset != NULL);
    tsk_memcpy(ancestral_state_offset, source_table.ancestral_state_offset,
        (num_rows + 1) * sizeof(tsk_size_t));
    metadata = tsk_malloc(num_rows * test_metadata_length * sizeof(char));
    CU_ASSERT_FATAL(metadata != NULL);
    tsk_memcpy(
        metadata, source_table.metadata, num_rows * test_metadata_length * sizeof(char));
    metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t));
    CU_ASSERT_FATAL(metadata_offset != NULL);
    tsk_memcpy(metadata_offset, source_table.metadata_offset,
        (num_rows + 1) * sizeof(tsk_size_t));

    ret = tsk_site_table_init(&table, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* Add one row so that we can check takeset frees it */
    ret_id = tsk_site_table_add_row(&table, 1, test_ancestral_state,
        test_ancestral_state_length, test_metadata, test_metadata_length);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    /* The arrays are byte copies of source_table's columns, so after a
     * successful takeset the two tables must compare equal. */
    ret = tsk_site_table_takeset_columns(&table, num_rows, position, ancestral_state,
        ancestral_state_offset, metadata, metadata_offset);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_site_table_equals(&source_table, &table, 0));

    /* Test error states, all of these must not take the array, or free existing */
    /* NOTE(review): the pointers passed below are the same ones handed to the
     * successful takeset call above - presumably now owned by `table`. */
    /* position, ancestral_state and ancestral_state_offset cannot be NULL */
    ret = tsk_site_table_takeset_columns(&table, num_rows, NULL, ancestral_state,
        ancestral_state_offset, metadata, metadata_offset);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE);
    ret = tsk_site_table_takeset_columns(&table, num_rows, position, NULL,
        ancestral_state_offset, metadata, metadata_offset);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE);
    ret = tsk_site_table_takeset_columns(
        &table, num_rows, position, ancestral_state, NULL, metadata, metadata_offset);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE);
    /* metadata and metadata offset must be simultaneously NULL or not */
    ret = tsk_site_table_takeset_columns(&table, num_rows, position, ancestral_state,
        ancestral_state_offset, NULL, metadata_offset);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE);
    ret = tsk_site_table_takeset_columns(&table, num_rows, position, ancestral_state,
        ancestral_state_offset, metadata, NULL);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE);

    /* Check bad offset in ancestral_state */
    ancestral_state_offset[0] = 1;
    ret = tsk_site_table_takeset_columns(&table, num_rows, position, ancestral_state,
        ancestral_state_offset, metadata, metadata_offset);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET);

    /* Truncation after takeset keeps memory and max_rows */
    ret = tsk_site_table_clear(&table);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(table.max_rows, num_rows);

    /* Allocate a fresh set of arrays (again copied from source_table) for the
     * second takeset; no metadata arrays are needed this time. */
    position = tsk_malloc(num_rows * sizeof(double));
    CU_ASSERT_FATAL(position != NULL);
    tsk_memcpy(position, source_table.position, num_rows * sizeof(double));
    ancestral_state = tsk_malloc(num_rows * test_ancestral_state_length * sizeof(char));
    CU_ASSERT_FATAL(ancestral_state != NULL);
    tsk_memcpy(ancestral_state, source_table.ancestral_state,
        num_rows * test_ancestral_state_length * sizeof(char));
    ancestral_state_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t));
    CU_ASSERT_FATAL(ancestral_state_offset != NULL);
    tsk_memcpy(ancestral_state_offset, source_table.ancestral_state_offset,
        (num_rows + 1) * sizeof(tsk_size_t));
    /* if metadata and offset are both null, all entries are zero length*/
    /* Only the first 10 rows of the 100-row arrays are handed over here. */
    num_rows = 10;
    ret = tsk_site_table_takeset_columns(
        &table, num_rows, position, ancestral_state, ancestral_state_offset, NULL, NULL);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(table.num_rows, num_rows);
    CU_ASSERT_EQUAL(
        tsk_memcmp(table.metadata_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0);
    CU_ASSERT_EQUAL(table.metadata_length, 0);

    ret = tsk_site_table_free(&table);
    CU_ASSERT_EQUAL(ret, 0);
    ret = tsk_site_table_free(&source_table);
    CU_ASSERT_EQUAL(ret, 0);
}

static void
test_site_table_update_row(void)
{
    int ret;
tsk_id_t ret_id;
    tsk_site_table_t table;
    tsk_site_t row;
    const char *ancestral_state = "XYZ";
    const char *metadata = "ABC";

    /* Three rows whose ancestral state and metadata are the 1-, 2- and
     * 3-character prefixes of "XYZ" and "ABC". */
    ret = tsk_site_table_init(&table, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret_id = tsk_site_table_add_row(&table, 0, ancestral_state, 1, metadata, 1);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_site_table_add_row(&table, 1, ancestral_state, 2, metadata, 2);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_site_table_add_row(&table, 2, ancestral_state, 3, metadata, 3);
    CU_ASSERT_FATAL(ret_id >= 0);

    /* Replace row 0 with same-length values taken from the second character
     * of each string. */
    ret = tsk_site_table_update_row(
        &table, 0, 1, &ancestral_state[1], 1, &metadata[1], 1);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_site_table_get_row(&table, 0, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.position, 1);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state_length, 1);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[0], 'Y');
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B');

    /* Update using the pointers just returned by get_row, changing only the
     * position. */
    ret = tsk_site_table_update_row(&table, 0, row.position + 1, row.ancestral_state,
        row.ancestral_state_length, row.metadata, row.metadata_length);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_site_table_get_row(&table, 0, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.position, 2);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state_length, 1);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[0], 'Y');
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B');

    /* Writing a row back with its own values must leave it unchanged. */
    ret = tsk_site_table_update_row(&table, 0, row.position, row.ancestral_state,
        row.ancestral_state_length, row.metadata, row.metadata_length);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_site_table_get_row(&table, 0, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.position, 2);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state_length, 1);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[0], 'Y');
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B');

    /* Drop the ancestral state (NULL, length 0) while keeping the metadata. */
    ret = tsk_site_table_update_row(
        &table, 0, row.position, NULL, 0, row.metadata, row.metadata_length);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_site_table_get_row(&table, 0, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.position, 2);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state_length, 0);
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B');

    /* Grow row 0 to three characters of state and metadata. */
    ret = tsk_site_table_update_row(&table, 0, 2, ancestral_state, 3, metadata, 3);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_site_table_get_row(&table, 0, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.position, 2);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state_length, 3);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[0], 'X');
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[1], 'Y');
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[2], 'Z');
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A');
    CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B');
    CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C');

    /* Empty out the middle row entirely. */
    ret = tsk_site_table_update_row(&table, 1, 5, NULL, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_site_table_get_row(&table, 1, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.position, 5);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state_length, 0);
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0);

    /* Row 2 was never updated and must still hold its original values. */
    ret = tsk_site_table_get_row(&table, 2, &row);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(row.position, 2);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state_length, 3);
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[0], 'X');
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[1], 'Y');
    CU_ASSERT_EQUAL_FATAL(row.ancestral_state[2], 'Z');
    CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3);
    CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A');
    CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B');
    CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C');

    /* Index 3 is one past the last row. */
    ret = tsk_site_table_update_row(&table, 3, 0, NULL, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS);

    tsk_site_table_free(&table);
}

static void
test_site_table_keep_rows(void)
{
    int ret;
    tsk_id_t
ret_id; tsk_size_t j; tsk_site_table_t source, t1, t2; tsk_site_t row; const char *ancestral_state = "XYZ"; const char *metadata = "ABC"; tsk_bool_t keep[3] = { 1, 1, 1 }; tsk_id_t id_map[3]; tsk_id_t indexes[] = { 0, 1, 2 }; ret = tsk_site_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&source, 0, ancestral_state, 1, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&source, 1, ancestral_state, 2, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&source, 2, ancestral_state, 3, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_site_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&t1, &source, 0)); ret = tsk_site_table_keep_rows(&t1, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&t1, &source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_site_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_site_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_site_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_site_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.position, 1); CU_ASSERT_EQUAL_FATAL(row.ancestral_state_length, 2); CU_ASSERT_EQUAL_FATAL(row.ancestral_state[0], 'X'); CU_ASSERT_EQUAL_FATAL(row.ancestral_state[1], 'Y'); 
CU_ASSERT_EQUAL_FATAL(row.metadata_length, 2); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); tsk_site_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_site_table_copy(&source, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_truncate(&t1, j + 1); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_site_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_site_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&source, &t2, 0)); tsk_site_table_free(&t1); tsk_site_table_free(&t2); } tsk_site_table_free(&source); } static void test_mutation_table(void) { int ret; tsk_id_t ret_id; tsk_mutation_table_t table, table2; tsk_size_t num_rows = 100; tsk_size_t max_len = 20; tsk_size_t k, len; tsk_id_t j; tsk_id_t *node; tsk_id_t *parent; tsk_id_t *site; double *time; char *derived_state, *metadata; char c[max_len + 1]; tsk_size_t *derived_state_offset, *metadata_offset; tsk_mutation_t mutation, mutation2; tsk_id_t row_subset[6] = { 1, 9, 1, 0, 2, 2 }; tsk_size_t num_row_subset = 6; for (j = 0; j < (tsk_id_t) max_len; j++) { c[j] = (char) ('A' + j); } ret = tsk_mutation_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_mutation_table_set_max_rows_increment(&table, 1); tsk_mutation_table_set_max_metadata_length_increment(&table, 1); tsk_mutation_table_set_max_derived_state_length_increment(&table, 1); tsk_mutation_table_print_state(&table, _devnull); ret = tsk_mutation_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); len = 0; for (j = 0; j < (tsk_id_t) num_rows; j++) { k = 
TSK_MIN((tsk_size_t) j + 1, max_len); ret_id = tsk_mutation_table_add_row(&table, j, j, j, (double) j, c, k, c, k); CU_ASSERT_EQUAL_FATAL(ret_id, j); CU_ASSERT_EQUAL(table.site[j], j); CU_ASSERT_EQUAL(table.node[j], j); CU_ASSERT_EQUAL(table.parent[j], j); CU_ASSERT_EQUAL(table.time[j], j); CU_ASSERT_EQUAL(table.derived_state_offset[j], len); CU_ASSERT_EQUAL(table.metadata_offset[j], len); CU_ASSERT_EQUAL(table.num_rows, (tsk_size_t) j + 1); len += k; CU_ASSERT_EQUAL(table.derived_state_offset[j + 1], len); CU_ASSERT_EQUAL(table.derived_state_length, len); CU_ASSERT_EQUAL(table.metadata_offset[j + 1], len); CU_ASSERT_EQUAL(table.metadata_length, len); ret = tsk_mutation_table_get_row(&table, (tsk_id_t) j, &mutation); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(mutation.id, j); CU_ASSERT_EQUAL(mutation.site, j); CU_ASSERT_EQUAL(mutation.node, j); CU_ASSERT_EQUAL(mutation.parent, j); CU_ASSERT_EQUAL(mutation.time, j); CU_ASSERT_EQUAL(mutation.metadata_length, k); CU_ASSERT_NSTRING_EQUAL(mutation.metadata, c, k); CU_ASSERT_EQUAL(mutation.derived_state_length, k); CU_ASSERT_NSTRING_EQUAL(mutation.derived_state, c, k); } ret = tsk_mutation_table_get_row(&table, (tsk_id_t) num_rows, &mutation); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); tsk_mutation_table_print_state(&table, _devnull); ret = tsk_mutation_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); num_rows *= 2; site = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(site != NULL); node = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(node != NULL); parent = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(parent != NULL); time = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(time != NULL); derived_state = tsk_malloc(num_rows * sizeof(char)); CU_ASSERT_FATAL(derived_state != NULL); derived_state_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(derived_state_offset != NULL); metadata = tsk_malloc(num_rows * sizeof(char)); 
CU_ASSERT_FATAL(metadata != NULL); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); for (j = 0; j < (tsk_id_t) num_rows; j++) { node[j] = j; site[j] = j + 1; parent[j] = j + 2; time[j] = (double) (j + 3); derived_state[j] = 'Y'; derived_state_offset[j] = (tsk_size_t) j; metadata[j] = 'M'; metadata_offset[j] = (tsk_size_t) j; } derived_state_offset[num_rows] = num_rows; metadata_offset[num_rows] = num_rows; ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.derived_state_length, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); /* Append another num_rows */ ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.site + num_rows, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.node + num_rows, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parent + num_rows, parent, num_rows * 
sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.time + num_rows, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.derived_state_length, 2 * num_rows); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.metadata_length, 2 * num_rows); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); /* Truncate back to num_rows */ ret = tsk_mutation_table_truncate(&table, num_rows); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.derived_state_length, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); /* Test equality with and without metadata */ tsk_mutation_table_copy(&table, &table2, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the metadata values */ table2.metadata[0] = 0; CU_ASSERT_FALSE(tsk_mutation_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the last metadata entry */ table2.metadata_offset[table2.num_rows] = 
table2.metadata_offset[table2.num_rows - 1]; CU_ASSERT_FALSE(tsk_mutation_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Delete all metadata */ tsk_memset(table2.metadata_offset, 0, (table2.num_rows + 1) * sizeof(*table2.metadata_offset)); CU_ASSERT_FALSE(tsk_mutation_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_mutation_table_free(&table2); ret = tsk_mutation_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); /* Check all this again, except with parent == NULL, time == NULL * and metadata == NULL. */ tsk_memset(parent, 0xff, num_rows * sizeof(tsk_id_t)); for (j = 0; j < (tsk_id_t) num_rows; j++) { time[j] = TSK_UNKNOWN_TIME; } tsk_memset(metadata_offset, 0, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, NULL, NULL, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.derived_state_offset, derived_state_offset, num_rows * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.derived_state_length, num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); /* Append another num_rows */ ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, NULL, NULL, derived_state, derived_state_offset, NULL, NULL); 
CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.site + num_rows, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.node + num_rows, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parent + num_rows, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.time + num_rows, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.derived_state + num_rows, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.derived_state_length, 2 * num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); /* Inputs except parent, time, metadata and metadata_offset cannot be NULL*/ ret = tsk_mutation_table_set_columns(&table, num_rows, NULL, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_set_columns(&table, num_rows, site, NULL, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, NULL, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, 
NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Inputs except parent, time, metadata and metadata_offset cannot be NULL*/ ret = tsk_mutation_table_append_columns(&table, num_rows, NULL, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_append_columns(&table, num_rows, site, NULL, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, NULL, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, derived_state, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Test extend method */ for (j = 0; j < (tsk_id_t) num_rows; j++) { parent[j] = j + 2; time[j] = (double) (j + 3); metadata[j] = (char) ('A' + j); metadata_offset[j] = (tsk_size_t) j; } metadata_offset[num_rows] = num_rows; ret = tsk_mutation_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_init(&table2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Can't extend from self */ ret = tsk_mutation_table_extend(&table, &table, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANNOT_EXTEND_FROM_SELF); /* Two 
empty tables */ CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, 0)); ret = tsk_mutation_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, 0)); /* Row out of bounds */ ret = tsk_mutation_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); /* Num rows out of bounds */ ret = tsk_mutation_table_extend(&table, &table2, num_rows * 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); /* Copy rows in order if index NULL */ ret = tsk_mutation_table_set_columns(&table2, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_mutation_table_equals(&table, &table2, 0)); ret = tsk_mutation_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, 0)); /* Copy nothing if index not NULL but length zero */ ret = tsk_mutation_table_extend(&table, &table2, 0, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, 0)); /* Copy first N rows in order if index NULL */ ret = tsk_mutation_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_extend(&table, &table2, num_rows / 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_truncate(&table2, num_rows / 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, 0)); ret = tsk_mutation_table_set_columns(&table2, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Copy a subset */ ret = tsk_mutation_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_mutation_table_equals(&table, &table2, 0)); ret = 
tsk_mutation_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (k = 0; k < num_row_subset; k++) { ret = tsk_mutation_table_get_row(&table, (tsk_id_t) k, &mutation); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_get_row(&table2, row_subset[k], &mutation2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(mutation.site, mutation2.site); CU_ASSERT_EQUAL(mutation.node, mutation2.node); CU_ASSERT_EQUAL(mutation.parent, mutation2.parent); CU_ASSERT_EQUAL(mutation.time, mutation2.time); CU_ASSERT_EQUAL(mutation.derived_state_length, mutation2.derived_state_length); CU_ASSERT_EQUAL(mutation.metadata_length, mutation2.metadata_length); CU_ASSERT_EQUAL( tsk_memcmp(mutation.derived_state, mutation2.derived_state, mutation.derived_state_length * sizeof(*mutation.derived_state)), 0); CU_ASSERT_EQUAL(tsk_memcmp(mutation.metadata, mutation2.metadata, mutation.metadata_length * sizeof(*mutation.metadata)), 0); } /* Test for bad offsets */ derived_state_offset[0] = 1; ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); derived_state_offset[0] = 0; derived_state_offset[num_rows] = 0; ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); ret = tsk_mutation_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.metadata_schema_length, 0); CU_ASSERT_EQUAL(table.metadata_schema, NULL); const char *example = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_length = (tsk_size_t) strlen(example); const char *example2 = "A different example 🎄🌳🌴🌲🎋"; tsk_size_t example2_length = (tsk_size_t) strlen(example); tsk_mutation_table_set_metadata_schema(&table, example, example_length); CU_ASSERT_EQUAL(table.metadata_schema_length, example_length); 
CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_schema, example, example_length), 0); tsk_mutation_table_copy(&table, &table2, TSK_NO_INIT); CU_ASSERT_EQUAL(table.metadata_schema_length, table2.metadata_schema_length); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_schema, table2.metadata_schema, example_length), 0); tsk_mutation_table_set_metadata_schema(&table2, example, example_length); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, 0)); tsk_mutation_table_set_metadata_schema(&table2, example2, example2_length); CU_ASSERT_FALSE(tsk_mutation_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_mutation_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_mutation_table_clear(&table); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.derived_state_length, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); tsk_mutation_table_free(&table); CU_ASSERT_EQUAL(ret, 0); tsk_mutation_table_free(&table2); CU_ASSERT_EQUAL(ret, 0); free(site); free(node); free(parent); free(time); free(derived_state); free(derived_state_offset); free(metadata); free(metadata_offset); } static void test_mutation_table_takeset(void) { int ret = 0; tsk_id_t ret_id; tsk_mutation_table_t source_table, table; tsk_size_t num_rows = 100; tsk_id_t j; tsk_id_t *site; tsk_id_t *node; tsk_id_t *parent; double *time; char *derived_state; tsk_size_t *derived_state_offset; char *metadata; tsk_size_t *metadata_offset; const char *test_derived_state = "red"; tsk_size_t test_derived_state_length = 3; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; tsk_size_t zeros[num_rows + 1]; tsk_id_t neg_ones[num_rows]; double unknown_times[num_rows]; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); tsk_memset(neg_ones, 0xff, num_rows * sizeof(tsk_id_t)); /* Make a table to copy from */ ret = tsk_mutation_table_init(&source_table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { unknown_times[j] = TSK_UNKNOWN_TIME; 
ret_id = tsk_mutation_table_add_row(&source_table, j, j + 1, j + 2, (double) j + 3, test_derived_state, test_derived_state_length, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); } /* Prepare arrays to be taken */ site = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(site != NULL); tsk_memcpy(site, source_table.site, num_rows * sizeof(tsk_id_t)); node = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(node != NULL); tsk_memcpy(node, source_table.node, num_rows * sizeof(tsk_id_t)); parent = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(parent != NULL); tsk_memcpy(parent, source_table.parent, num_rows * sizeof(tsk_id_t)); time = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(time != NULL); tsk_memcpy(time, source_table.time, num_rows * sizeof(double)); derived_state = tsk_malloc(num_rows * test_derived_state_length * sizeof(char)); CU_ASSERT_FATAL(derived_state != NULL); tsk_memcpy(derived_state, source_table.derived_state, num_rows * test_derived_state_length * sizeof(char)); derived_state_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(derived_state_offset != NULL); tsk_memcpy(derived_state_offset, source_table.derived_state_offset, (num_rows + 1) * sizeof(tsk_size_t)); metadata = tsk_malloc(num_rows * test_metadata_length * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); tsk_memcpy( metadata, source_table.metadata, num_rows * test_metadata_length * sizeof(char)); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); tsk_memcpy(metadata_offset, source_table.metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_mutation_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add one row so that we can check takeset frees it */ ret_id = tsk_mutation_table_add_row(&table, 1, 1, 1, 1, test_derived_state, test_derived_state_length, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = 
tsk_mutation_table_takeset_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&source_table, &table, 0)); /* Test error states, all of these must not take the array, or free existing */ /* metadata and metadata offset must be simultaneously NULL or not */ ret = tsk_mutation_table_takeset_columns(&table, num_rows, NULL, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_takeset_columns(&table, num_rows, site, NULL, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); // Parent and time not tested as they have deafults ret = tsk_mutation_table_takeset_columns(&table, num_rows, site, node, parent, time, NULL, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_takeset_columns(&table, num_rows, site, node, parent, time, derived_state, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_takeset_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_takeset_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Check error on bad derived_state offset */ derived_state_offset[0] = 1; ret = tsk_mutation_table_takeset_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); /* Truncation after takeset keeps memory and max_rows */ ret = tsk_mutation_table_clear(&table); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL_FATAL(table.max_rows, num_rows); // Re init non-optional arrays site = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(site != NULL); tsk_memcpy(site, source_table.site, num_rows * sizeof(tsk_id_t)); node = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(node != NULL); tsk_memcpy(node, source_table.node, num_rows * sizeof(tsk_id_t)); derived_state = tsk_malloc(num_rows * test_derived_state_length * sizeof(char)); CU_ASSERT_FATAL(derived_state != NULL); tsk_memcpy(derived_state, source_table.derived_state, num_rows * test_derived_state_length * sizeof(char)); derived_state_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(derived_state_offset != NULL); tsk_memcpy(derived_state_offset, source_table.derived_state_offset, (num_rows + 1) * sizeof(tsk_size_t)); /* if metadata and offset are both null, all entries are zero length, if parent or * time are NULL they default to null values*/ num_rows = 10; ret = tsk_mutation_table_takeset_columns(&table, num_rows, site, node, NULL, NULL, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(tsk_memcmp(table.parent, neg_ones, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.time, unknown_times, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_mutation_table_free(&table); CU_ASSERT_EQUAL(ret, 0); ret = tsk_mutation_table_free(&source_table); CU_ASSERT_EQUAL(ret, 0); } static void test_mutation_table_update_row(void) { int ret; tsk_id_t ret_id; tsk_mutation_table_t table; tsk_mutation_t row; const char *derived_state = "XYZ"; const char *metadata = "ABC"; ret = tsk_mutation_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row(&table, 0, 1, 2, 3, derived_state, 1, metadata, 1); CU_ASSERT_FATAL(ret_id >= 
0); ret_id = tsk_mutation_table_add_row(&table, 1, 2, 3, 4, derived_state, 2, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&table, 2, 3, 4, 5, derived_state, 3, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_mutation_table_update_row( &table, 0, 1, 2, 3, 4, &derived_state[1], 1, &metadata[1], 1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.site, 1); CU_ASSERT_EQUAL_FATAL(row.node, 2); CU_ASSERT_EQUAL_FATAL(row.parent, 3); CU_ASSERT_EQUAL_FATAL(row.time, 4); CU_ASSERT_EQUAL_FATAL(row.derived_state_length, 1); CU_ASSERT_EQUAL_FATAL(row.derived_state[0], 'Y'); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_mutation_table_update_row(&table, 0, row.site + 1, row.node + 1, row.parent + 1, row.time + 1, row.derived_state, row.derived_state_length, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.site, 2); CU_ASSERT_EQUAL_FATAL(row.node, 3); CU_ASSERT_EQUAL_FATAL(row.parent, 4); CU_ASSERT_EQUAL_FATAL(row.time, 5); CU_ASSERT_EQUAL_FATAL(row.derived_state_length, 1); CU_ASSERT_EQUAL_FATAL(row.derived_state[0], 'Y'); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_mutation_table_update_row(&table, 0, row.site, row.node, row.parent, row.time, row.derived_state, row.derived_state_length, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.site, 2); CU_ASSERT_EQUAL_FATAL(row.node, 3); CU_ASSERT_EQUAL_FATAL(row.parent, 4); CU_ASSERT_EQUAL_FATAL(row.time, 5); CU_ASSERT_EQUAL_FATAL(row.derived_state_length, 1); CU_ASSERT_EQUAL_FATAL(row.derived_state[0], 'Y'); 
CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_mutation_table_update_row(&table, 0, row.site, row.node, row.parent, row.time, NULL, 0, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.site, 2); CU_ASSERT_EQUAL_FATAL(row.node, 3); CU_ASSERT_EQUAL_FATAL(row.parent, 4); CU_ASSERT_EQUAL_FATAL(row.time, 5); CU_ASSERT_EQUAL_FATAL(row.derived_state_length, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_mutation_table_update_row( &table, 0, 2, 3, 4, 5, derived_state, 3, metadata, 3); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.site, 2); CU_ASSERT_EQUAL_FATAL(row.node, 3); CU_ASSERT_EQUAL_FATAL(row.parent, 4); CU_ASSERT_EQUAL_FATAL(row.time, 5); CU_ASSERT_EQUAL_FATAL(row.derived_state_length, 3); CU_ASSERT_EQUAL_FATAL(row.derived_state[0], 'X'); CU_ASSERT_EQUAL_FATAL(row.derived_state[1], 'Y'); CU_ASSERT_EQUAL_FATAL(row.derived_state[2], 'Z'); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_mutation_table_update_row(&table, 1, 5, 6, 7, 8, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_get_row(&table, 1, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.site, 5); CU_ASSERT_EQUAL_FATAL(row.node, 6); CU_ASSERT_EQUAL_FATAL(row.parent, 7); CU_ASSERT_EQUAL_FATAL(row.time, 8); CU_ASSERT_EQUAL_FATAL(row.derived_state_length, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0); ret = tsk_mutation_table_get_row(&table, 2, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.site, 2); CU_ASSERT_EQUAL_FATAL(row.node, 3); CU_ASSERT_EQUAL_FATAL(row.parent, 4); 
CU_ASSERT_EQUAL_FATAL(row.time, 5); CU_ASSERT_EQUAL_FATAL(row.derived_state_length, 3); CU_ASSERT_EQUAL_FATAL(row.derived_state[0], 'X'); CU_ASSERT_EQUAL_FATAL(row.derived_state[1], 'Y'); CU_ASSERT_EQUAL_FATAL(row.derived_state[2], 'Z'); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_mutation_table_update_row(&table, 3, 0, 0, 0, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); tsk_mutation_table_free(&table); } static void test_mutation_table_keep_rows(void) { int ret; tsk_id_t ret_id; tsk_size_t j; tsk_mutation_table_t source, t1, t2; tsk_mutation_t row; const char *derived_state = "XYZ"; const char *metadata = "ABC"; tsk_bool_t keep[3] = { 1, 1, 1 }; tsk_id_t id_map[3]; tsk_id_t indexes[] = { 0, 1, 2 }; ret = tsk_mutation_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &source, 0, 1, -1, 3.0, derived_state, 1, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &source, 1, 2, -1, 4.0, derived_state, 2, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &source, 2, 3, 0, 5.0, derived_state, 3, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_mutation_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&t1, &source, 0)); ret = tsk_mutation_table_keep_rows(&t1, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&t1, &source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_mutation_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); 
CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_mutation_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_mutation_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_mutation_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.site, 1); CU_ASSERT_EQUAL_FATAL(row.node, 2); CU_ASSERT_EQUAL_FATAL(row.parent, -1); CU_ASSERT_EQUAL_FATAL(row.time, 4); CU_ASSERT_EQUAL_FATAL(row.derived_state_length, 2); CU_ASSERT_EQUAL_FATAL(row.derived_state[0], 'X'); CU_ASSERT_EQUAL_FATAL(row.derived_state[1], 'Y'); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 2); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); tsk_mutation_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_mutation_table_copy(&source, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_truncate(&t1, j + 1); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_mutation_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_mutation_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_mutation_table_equals(&source, &t2, 0)); tsk_mutation_table_free(&t1); tsk_mutation_table_free(&t2); } tsk_mutation_table_free(&source); } static void test_mutation_table_keep_rows_parent_references(void) { int ret; tsk_id_t ret_id; tsk_mutation_table_t 
source, t; tsk_bool_t keep[4] = { 1, 1, 1, 1 }; tsk_id_t id_map[4]; ret = tsk_mutation_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row(&source, 0, 1, -1, 3.0, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&source, 1, 2, -1, 4.0, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&source, 2, 3, 1, 5.0, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&source, 3, 4, 1, 6.0, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_mutation_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* OOB errors */ t.parent[0] = -2; ret = tsk_mutation_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(t.num_rows, 4); t.parent[0] = 4; ret = tsk_mutation_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(t.num_rows, 4); /* But ignored if row is not kept */ keep[0] = false; ret = tsk_mutation_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_mutation_table_free(&t); ret = tsk_mutation_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Try to remove referenced row 1 */ keep[0] = true; keep[1] = false; ret = tsk_mutation_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_KEEP_ROWS_MAP_TO_DELETED); CU_ASSERT_TRUE(tsk_mutation_table_equals(&source, &t, 0)); tsk_mutation_table_free(&t); ret = tsk_mutation_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* remove unreferenced row 0 */ keep[0] = false; keep[1] = true; ret = tsk_mutation_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t.num_rows, 3); CU_ASSERT_EQUAL_FATAL(t.parent[0], TSK_NULL); CU_ASSERT_EQUAL_FATAL(t.parent[1], 0); CU_ASSERT_EQUAL_FATAL(t.parent[2], 0); tsk_mutation_table_free(&t); /* Check that we don't change the table 
in error cases. */ source.parent[3] = -2; ret = tsk_mutation_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = true; ret = tsk_mutation_table_keep_rows(&t, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); CU_ASSERT_TRUE(tsk_mutation_table_equals(&source, &t, 0)); tsk_mutation_table_free(&t); /* Check that we don't change the table in error cases. */ source.parent[3] = 0; ret = tsk_mutation_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = false; ret = tsk_mutation_table_keep_rows(&t, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_KEEP_ROWS_MAP_TO_DELETED); CU_ASSERT_TRUE(tsk_mutation_table_equals(&source, &t, 0)); tsk_mutation_table_free(&t); tsk_mutation_table_free(&source); } static void test_migration_table(void) { int ret; tsk_id_t ret_id; tsk_migration_table_t table, table2; tsk_size_t num_rows = 100; tsk_id_t j; tsk_id_t *node; tsk_id_t *source, *dest; double *left, *right, *time; tsk_migration_t migration, migration2; char *metadata; tsk_size_t *metadata_offset; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; char metadata_copy[test_metadata_length + 1]; tsk_id_t row_subset[6] = { 1, 9, 1, 0, 2, 2 }; tsk_size_t num_row_subset = 6; metadata_copy[test_metadata_length] = '\0'; ret = tsk_migration_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_migration_table_set_max_rows_increment(&table, 1); tsk_migration_table_print_state(&table, _devnull); ret = tsk_migration_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_migration_table_add_row(&table, (double) j, (double) j, j, j, j, (double) j, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); CU_ASSERT_EQUAL(table.left[j], j); CU_ASSERT_EQUAL(table.right[j], j); CU_ASSERT_EQUAL(table.node[j], j); CU_ASSERT_EQUAL(table.source[j], j); CU_ASSERT_EQUAL(table.dest[j], j); CU_ASSERT_EQUAL(table.time[j], j); 
CU_ASSERT_EQUAL(table.num_rows, (tsk_size_t) j + 1); CU_ASSERT_EQUAL( table.metadata_length, (tsk_size_t) (j + 1) * test_metadata_length); CU_ASSERT_EQUAL(table.metadata_offset[j + 1], table.metadata_length); /* check the metadata */ tsk_memcpy(metadata_copy, table.metadata + table.metadata_offset[j], test_metadata_length); CU_ASSERT_NSTRING_EQUAL(metadata_copy, test_metadata, test_metadata_length); ret = tsk_migration_table_get_row(&table, (tsk_id_t) j, &migration); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(migration.id, j); CU_ASSERT_EQUAL(migration.left, j); CU_ASSERT_EQUAL(migration.right, j); CU_ASSERT_EQUAL(migration.node, j); CU_ASSERT_EQUAL(migration.source, j); CU_ASSERT_EQUAL(migration.dest, j); CU_ASSERT_EQUAL(migration.time, j); CU_ASSERT_EQUAL(migration.metadata_length, test_metadata_length); CU_ASSERT_NSTRING_EQUAL(migration.metadata, test_metadata, test_metadata_length); } ret = tsk_migration_table_get_row(&table, (tsk_id_t) num_rows, &migration); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); tsk_migration_table_print_state(&table, _devnull); ret = tsk_migration_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); num_rows *= 2; left = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(left != NULL); tsk_memset(left, 1, num_rows * sizeof(double)); right = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(right != NULL); tsk_memset(right, 2, num_rows * sizeof(double)); time = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(time != NULL); tsk_memset(time, 3, num_rows * sizeof(double)); node = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(node != NULL); tsk_memset(node, 4, num_rows * sizeof(tsk_id_t)); source = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(source != NULL); tsk_memset(source, 5, num_rows * sizeof(tsk_id_t)); dest = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(dest != NULL); tsk_memset(dest, 6, num_rows * sizeof(tsk_id_t)); metadata = tsk_malloc(num_rows * 
sizeof(char)); tsk_memset(metadata, 'a', num_rows * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); for (j = 0; j < (tsk_id_t) num_rows + 1; j++) { metadata_offset[j] = (tsk_size_t) j; } ret = tsk_migration_table_set_columns(&table, num_rows, left, right, node, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.source, source, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.dest, dest, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); /* Append another num_rows */ ret = tsk_migration_table_append_columns(&table, num_rows, left, right, node, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.left + num_rows, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.right + num_rows, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.time + num_rows, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( 
tsk_memcmp(table.node + num_rows, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.source, source, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.source + num_rows, source, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.dest, dest, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.dest + num_rows, dest, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata + num_rows, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.metadata_length, 2 * num_rows); /* Truncate back to num_rows */ ret = tsk_migration_table_truncate(&table, num_rows); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.source, source, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.dest, dest, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); /* Test equality with and without metadata */ tsk_migration_table_copy(&table, &table2, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the metadata values */ table2.metadata[0] = 0; CU_ASSERT_FALSE(tsk_migration_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, 
TSK_CMP_IGNORE_METADATA)); /* Change the last metadata entry */ table2.metadata_offset[table2.num_rows] = table2.metadata_offset[table2.num_rows - 1]; CU_ASSERT_FALSE(tsk_migration_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Delete all metadata */ tsk_memset(table2.metadata_offset, 0, (table2.num_rows + 1) * sizeof(*table2.metadata_offset)); CU_ASSERT_FALSE(tsk_migration_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_migration_table_free(&table2); ret = tsk_migration_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); /* inputs cannot be NULL */ ret = tsk_migration_table_set_columns(&table, num_rows, NULL, right, node, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_set_columns(&table, num_rows, left, NULL, node, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_set_columns(&table, num_rows, left, right, NULL, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_set_columns(&table, num_rows, left, right, node, NULL, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_set_columns(&table, num_rows, left, right, node, source, NULL, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_set_columns(&table, num_rows, left, right, node, source, dest, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_set_columns( &table, num_rows, left, right, node, source, dest, time, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_set_columns( &table, num_rows, left, 
right, node, source, dest, time, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_migration_table_clear(&table); CU_ASSERT_EQUAL(table.num_rows, 0); /* if metadata and metadata_offset are both null, all metadatas are zero length */ num_rows = 10; tsk_memset(metadata_offset, 0, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_migration_table_set_columns( &table, num_rows, left, right, node, source, dest, time, NULL, NULL); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.source, source, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.dest, dest, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_migration_table_append_columns( &table, num_rows, left, right, node, source, dest, time, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.left, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.left + num_rows, left, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.right, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.right + num_rows, right, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.time + num_rows, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.node + num_rows, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.source, source, num_rows * 
sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.source + num_rows, source, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.dest, dest, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.dest + num_rows, dest, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset + num_rows, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); tsk_migration_table_print_state(&table, _devnull); ret = tsk_migration_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Test extend method */ ret = tsk_migration_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_init(&table2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Can't extend from self */ ret = tsk_migration_table_extend(&table, &table, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANNOT_EXTEND_FROM_SELF); /* Two empty tables */ CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, 0)); ret = tsk_migration_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, 0)); /* Row out of bounds */ ret = tsk_migration_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); /* Num rows out of bounds */ ret = tsk_migration_table_extend(&table, &table2, num_rows * 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); /* Copy rows in order if index NULL */ ret = tsk_migration_table_set_columns(&table2, num_rows, left, right, node, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_migration_table_equals(&table, &table2, 0)); ret = tsk_migration_table_extend(&table, &table2, table2.num_rows, NULL, 0); 
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, 0)); /* Copy nothing if index not NULL but length zero */ ret = tsk_migration_table_extend(&table, &table2, 0, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, 0)); /* Copy first N rows in order if index NULL */ ret = tsk_migration_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_extend(&table, &table2, num_rows / 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_truncate(&table2, num_rows / 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, 0)); ret = tsk_migration_table_set_columns(&table2, num_rows, left, right, node, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Copy a subset */ ret = tsk_migration_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_migration_table_equals(&table, &table2, 0)); ret = tsk_migration_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_row_subset; j++) { ret = tsk_migration_table_get_row(&table, j, &migration); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_get_row(&table2, row_subset[j], &migration2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(migration.source, migration2.source); CU_ASSERT_EQUAL(migration.dest, migration2.dest); CU_ASSERT_EQUAL(migration.node, migration2.node); CU_ASSERT_EQUAL(migration.left, migration2.left); CU_ASSERT_EQUAL(migration.right, migration2.right); CU_ASSERT_EQUAL(migration.time, migration2.time); CU_ASSERT_EQUAL(migration.metadata_length, migration2.metadata_length); CU_ASSERT_EQUAL(tsk_memcmp(migration.metadata, migration2.metadata, migration.metadata_length * sizeof(*migration.metadata)), 0); } ret = tsk_migration_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL(table.metadata_schema_length, 0); CU_ASSERT_EQUAL(table.metadata_schema, NULL); const char *example = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_length = (tsk_size_t) strlen(example); const char *example2 = "A different example 🎄🌳🌴🌲🎋"; tsk_size_t example2_length = (tsk_size_t) strlen(example); tsk_migration_table_set_metadata_schema(&table, example, example_length); CU_ASSERT_EQUAL(table.metadata_schema_length, example_length); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_schema, example, example_length), 0); tsk_migration_table_copy(&table, &table2, TSK_NO_INIT); CU_ASSERT_EQUAL(table.metadata_schema_length, table2.metadata_schema_length); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_schema, table2.metadata_schema, example_length), 0); tsk_migration_table_set_metadata_schema(&table2, example, example_length); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, 0)); tsk_migration_table_set_metadata_schema(&table2, example2, example2_length); CU_ASSERT_FALSE(tsk_migration_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE(tsk_migration_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_migration_table_clear(&table); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); tsk_migration_table_free(&table); CU_ASSERT_EQUAL(ret, 0); tsk_migration_table_free(&table2); CU_ASSERT_EQUAL(ret, 0); free(left); free(right); free(time); free(node); free(source); free(dest); free(metadata); free(metadata_offset); } static void test_migration_table_takeset(void) { int ret = 0; tsk_id_t ret_id; tsk_migration_table_t source_table, table; tsk_size_t num_rows = 100; tsk_id_t j; double *left; double *right; tsk_id_t *node; tsk_id_t *source; tsk_id_t *dest; double *time; char *metadata; tsk_size_t *metadata_offset; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; tsk_size_t zeros[num_rows + 1]; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); /* Make a 
table to copy from */ ret = tsk_migration_table_init(&source_table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_migration_table_add_row(&source_table, (double) j, (double) j + 1, j + 2, j + 3, j + 4, (double) j + 5, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); } /* Prepare arrays to be taken */ left = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(left != NULL); tsk_memcpy(left, source_table.left, num_rows * sizeof(double)); right = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(right != NULL); tsk_memcpy(right, source_table.right, num_rows * sizeof(double)); node = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(node != NULL); tsk_memcpy(node, source_table.node, num_rows * sizeof(tsk_id_t)); source = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(source != NULL); tsk_memcpy(source, source_table.source, num_rows * sizeof(tsk_id_t)); dest = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(dest != NULL); tsk_memcpy(dest, source_table.dest, num_rows * sizeof(tsk_id_t)); time = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(time != NULL); tsk_memcpy(time, source_table.time, num_rows * sizeof(double)); metadata = tsk_malloc(num_rows * test_metadata_length * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); tsk_memcpy( metadata, source_table.metadata, num_rows * test_metadata_length * sizeof(char)); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); tsk_memcpy(metadata_offset, source_table.metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_migration_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add one row so that we can check takeset frees it */ ret_id = tsk_migration_table_add_row( &table, 1, 1, 1, 1, 1, 1, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_migration_table_takeset_columns(&table, num_rows, left, right, node, source, 
dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&source_table, &table, 0)); /* Test error states, all of these must not take the array, or free existing */ /* metadata and metadata offset must be simultaneously NULL or not */ ret = tsk_migration_table_takeset_columns(&table, num_rows, NULL, right, node, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_takeset_columns(&table, num_rows, left, NULL, node, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_takeset_columns(&table, num_rows, left, right, NULL, source, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_takeset_columns(&table, num_rows, left, right, node, NULL, dest, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_takeset_columns(&table, num_rows, left, right, node, source, NULL, time, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_takeset_columns(&table, num_rows, left, right, node, source, dest, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_takeset_columns( &table, num_rows, left, right, node, source, dest, time, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_migration_table_takeset_columns( &table, num_rows, left, right, node, source, dest, time, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Truncation after takeset keeps memory and max_rows */ ret = tsk_migration_table_clear(&table); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(table.max_rows, num_rows); // Re init non-optional arrays left = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(left != NULL); tsk_memcpy(left, source_table.left, num_rows * 
sizeof(double)); right = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(right != NULL); tsk_memcpy(right, source_table.right, num_rows * sizeof(double)); node = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(node != NULL); tsk_memcpy(node, source_table.node, num_rows * sizeof(tsk_id_t)); source = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(source != NULL); tsk_memcpy(source, source_table.source, num_rows * sizeof(tsk_id_t)); dest = tsk_malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(dest != NULL); tsk_memcpy(dest, source_table.dest, num_rows * sizeof(tsk_id_t)); time = tsk_malloc(num_rows * sizeof(double)); CU_ASSERT_FATAL(time != NULL); tsk_memcpy(time, source_table.time, num_rows * sizeof(double)); /* if metadata and offset are both null, all entries are zero length */ num_rows = 10; ret = tsk_migration_table_takeset_columns( &table, num_rows, left, right, node, source, dest, time, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_migration_table_free(&table); CU_ASSERT_EQUAL(ret, 0); ret = tsk_migration_table_free(&source_table); CU_ASSERT_EQUAL(ret, 0); } static void test_migration_table_update_row(void) { int ret; tsk_id_t ret_id; tsk_migration_table_t table; tsk_migration_t row; const char *metadata = "ABC"; ret = tsk_migration_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row(&table, 0, 1.0, 2, 3, 4, 5, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_migration_table_add_row(&table, 1, 2.0, 3, 4, 5, 6, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_migration_table_add_row(&table, 2, 3.0, 4, 5, 6, 7, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_migration_table_update_row(&table, 0, 1, 2.0, 3, 4, 5, 6, &metadata[1], 1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = 
tsk_migration_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.left, 1); CU_ASSERT_EQUAL_FATAL(row.right, 2.0); CU_ASSERT_EQUAL_FATAL(row.node, 3); CU_ASSERT_EQUAL_FATAL(row.source, 4); CU_ASSERT_EQUAL_FATAL(row.dest, 5); CU_ASSERT_EQUAL_FATAL(row.time, 6); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_migration_table_update_row(&table, 0, row.left + 1, row.right + 1, row.node + 1, row.source + 1, row.dest + 1, row.time + 1, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.left, 2); CU_ASSERT_EQUAL_FATAL(row.right, 3.0); CU_ASSERT_EQUAL_FATAL(row.node, 4); CU_ASSERT_EQUAL_FATAL(row.source, 5); CU_ASSERT_EQUAL_FATAL(row.dest, 6); CU_ASSERT_EQUAL_FATAL(row.time, 7); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_migration_table_update_row(&table, 0, 0, 0, 0, 0, 0, 0, metadata, 3); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.left, 0); CU_ASSERT_EQUAL_FATAL(row.right, 0); CU_ASSERT_EQUAL_FATAL(row.node, 0); CU_ASSERT_EQUAL_FATAL(row.source, 0); CU_ASSERT_EQUAL_FATAL(row.dest, 0); CU_ASSERT_EQUAL_FATAL(row.time, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_migration_table_update_row(&table, 1, 0, 0, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_get_row(&table, 1, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.left, 0); CU_ASSERT_EQUAL_FATAL(row.right, 0); CU_ASSERT_EQUAL_FATAL(row.node, 0); CU_ASSERT_EQUAL_FATAL(row.source, 0); CU_ASSERT_EQUAL_FATAL(row.dest, 0); CU_ASSERT_EQUAL_FATAL(row.time, 0); 
CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0); ret = tsk_migration_table_get_row(&table, 2, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.left, 2); CU_ASSERT_EQUAL_FATAL(row.right, 3.0); CU_ASSERT_EQUAL_FATAL(row.node, 4); CU_ASSERT_EQUAL_FATAL(row.source, 5); CU_ASSERT_EQUAL_FATAL(row.dest, 6); CU_ASSERT_EQUAL_FATAL(row.time, 7); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_migration_table_update_row(&table, 3, 0, 0, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); tsk_migration_table_free(&table); } static void test_migration_table_keep_rows(void) { int ret; tsk_id_t ret_id; tsk_size_t j; tsk_migration_table_t source, t1, t2; tsk_migration_t row; const char *metadata = "ABC"; tsk_bool_t keep[3] = { 1, 1, 1 }; tsk_id_t id_map[3]; tsk_id_t indexes[] = { 0, 1, 2 }; ret = tsk_migration_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row(&source, 0, 1.0, 2, 3, 4, 5, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_migration_table_add_row(&source, 1, 2.0, 3, 4, 5, 6, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_migration_table_add_row(&source, 2, 3.0, 4, 5, 6, 7, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_migration_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&t1, &source, 0)); ret = tsk_migration_table_keep_rows(&t1, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&t1, &source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_migration_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_migration_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_migration_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_migration_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.left, 1); CU_ASSERT_EQUAL_FATAL(row.right, 2); CU_ASSERT_EQUAL_FATAL(row.node, 3); CU_ASSERT_EQUAL_FATAL(row.source, 4); CU_ASSERT_EQUAL_FATAL(row.dest, 5); CU_ASSERT_EQUAL_FATAL(row.time, 6); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 2); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); tsk_migration_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_migration_table_copy(&source, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_truncate(&t1, j + 1); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_migration_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_migration_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_migration_table_equals(&source, &t2, 0)); tsk_migration_table_free(&t1); tsk_migration_table_free(&t2); } tsk_migration_table_free(&source); } static void test_individual_table(void) { int ret = 0; tsk_id_t ret_id; tsk_individual_table_t table, table2; tsk_size_t num_rows = 100; tsk_id_t 
j; tsk_size_t k; tsk_flags_t *flags; double *location; tsk_id_t *parents; char *metadata; tsk_size_t *metadata_offset; tsk_size_t *parents_offset; tsk_size_t *location_offset; tsk_individual_t individual; tsk_individual_t individual2; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; char metadata_copy[test_metadata_length + 1]; tsk_size_t spatial_dimension = 2; tsk_size_t num_parents = 3; double test_location[spatial_dimension]; tsk_id_t test_parents[num_parents]; tsk_size_t zeros[num_rows + 1]; tsk_id_t row_subset[6] = { 1, 9, 1, 0, 2, 2 }; tsk_size_t num_row_subset = 6; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); for (k = 0; k < spatial_dimension; k++) { test_location[k] = (double) k; } for (k = 0; k < num_parents; k++) { test_parents[k] = (tsk_id_t) k + 42; } metadata_copy[test_metadata_length] = '\0'; ret = tsk_individual_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_individual_table_set_max_rows_increment(&table, 1); tsk_individual_table_set_max_metadata_length_increment(&table, 1); tsk_individual_table_set_max_location_length_increment(&table, 1); tsk_individual_table_set_max_parents_length_increment(&table, 1); tsk_individual_table_print_state(&table, _devnull); for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_individual_table_add_row(&table, (tsk_flags_t) j, test_location, spatial_dimension, test_parents, num_parents, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); CU_ASSERT_EQUAL(table.flags[j], (tsk_flags_t) j); for (k = 0; k < spatial_dimension; k++) { test_location[k] = (double) k; CU_ASSERT_EQUAL( table.location[spatial_dimension * (size_t) j + k], test_location[k]); } CU_ASSERT_EQUAL( table.metadata_length, (tsk_size_t) (j + 1) * test_metadata_length); CU_ASSERT_EQUAL(table.metadata_offset[j + 1], table.metadata_length); /* check the metadata */ tsk_memcpy(metadata_copy, table.metadata + table.metadata_offset[j], test_metadata_length); 
CU_ASSERT_NSTRING_EQUAL(metadata_copy, test_metadata, test_metadata_length); ret = tsk_individual_table_get_row(&table, (tsk_id_t) j, &individual); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(individual.id, j); CU_ASSERT_EQUAL(individual.flags, (tsk_flags_t) j); CU_ASSERT_EQUAL(individual.location_length, spatial_dimension); CU_ASSERT_NSTRING_EQUAL( individual.location, test_location, spatial_dimension * sizeof(double)); CU_ASSERT_EQUAL(individual.metadata_length, test_metadata_length); CU_ASSERT_NSTRING_EQUAL( individual.metadata, test_metadata, test_metadata_length); } /* Test equality with and without metadata */ tsk_individual_table_copy(&table, &table2, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_individual_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the metadata values */ table2.metadata[0] = 0; CU_ASSERT_FALSE(tsk_individual_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_individual_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the last metadata entry */ table2.metadata_offset[table2.num_rows] = table2.metadata_offset[table2.num_rows - 1]; CU_ASSERT_FALSE(tsk_individual_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_individual_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Delete all metadata */ tsk_memset(table2.metadata_offset, 0, (table2.num_rows + 1) * sizeof(*table2.metadata_offset)); CU_ASSERT_FALSE(tsk_individual_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_individual_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_individual_table_free(&table2); ret = tsk_individual_table_get_row(&table, (tsk_id_t) num_rows, &individual); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_individual_table_print_state(&table, _devnull); tsk_individual_table_clear(&table); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); num_rows *= 2; flags = tsk_malloc(num_rows * 
sizeof(tsk_flags_t)); CU_ASSERT_FATAL(flags != NULL); for (k = 0; k < num_rows; k++) { flags[k] = (tsk_flags_t) (k + num_rows); } location = tsk_malloc(spatial_dimension * num_rows * sizeof(double)); CU_ASSERT_FATAL(location != NULL); for (k = 0; k < spatial_dimension * num_rows; k++) { location[k] = (double) (k + (num_rows * 2)); } location_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(location_offset != NULL); for (j = 0; j < (tsk_id_t) num_rows + 1; j++) { location_offset[j] = (tsk_size_t) j * spatial_dimension; } parents = tsk_malloc(num_parents * num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(parents != NULL); for (k = 0; k < num_parents * num_rows; k++) { parents[k] = (tsk_id_t) (k + (num_rows * 4)); } parents_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(parents_offset != NULL); for (j = 0; j < (tsk_id_t) num_rows + 1; j++) { parents_offset[j] = (tsk_size_t) j * num_parents; } metadata = tsk_malloc(num_rows * sizeof(char)); for (k = 0; k < num_rows; k++) { metadata[k] = (char) ((k % 58) + 65); } CU_ASSERT_FATAL(metadata != NULL); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); for (j = 0; j < (tsk_id_t) num_rows + 1; j++) { metadata_offset[j] = (tsk_size_t) j; } ret = tsk_individual_table_set_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location, location, spatial_dimension * num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location_offset, location_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parents, parents, num_parents * num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parents_offset, parents_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); 
CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.location_length, spatial_dimension * num_rows); CU_ASSERT_EQUAL(table.parents_length, num_parents * num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); tsk_individual_table_print_state(&table, _devnull); /* Append another num_rows onto the end */ ret = tsk_individual_table_append_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.flags + num_rows, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata + num_rows, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location, location, spatial_dimension * num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location + spatial_dimension * num_rows, location, spatial_dimension * num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parents, parents, num_parents * num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parents + num_parents * num_rows, parents, num_parents * num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.metadata_length, 2 * num_rows); CU_ASSERT_EQUAL(table.parents_length, 2 * num_parents * num_rows); CU_ASSERT_EQUAL(table.location_length, 2 * spatial_dimension * num_rows); tsk_individual_table_print_state(&table, _devnull); ret = tsk_individual_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Truncate back to num_rows */ ret = tsk_individual_table_truncate(&table, num_rows); CU_ASSERT_EQUAL(ret, 
0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location, location, spatial_dimension * num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location_offset, location_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parents, parents, num_parents * num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parents_offset, parents_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset, metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.location_length, spatial_dimension * num_rows); CU_ASSERT_EQUAL(table.parents_length, num_parents * num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); tsk_individual_table_print_state(&table, _devnull); ret = tsk_individual_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); /* flags can't be NULL */ ret = tsk_individual_table_set_columns(&table, num_rows, NULL, location, location_offset, parents, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* location and location offset must be simultaneously NULL or not */ ret = tsk_individual_table_set_columns(&table, num_rows, flags, location, NULL, parents, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_individual_table_set_columns(&table, num_rows, flags, NULL, location_offset, NULL, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* parents and parents offset must be simultaneously NULL or not */ ret = tsk_individual_table_set_columns(&table, num_rows, flags, location, location_offset, parents, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = 
tsk_individual_table_set_columns(&table, num_rows, flags, location, location_offset, NULL, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* metadata and metadata offset must be simultaneously NULL or not */ ret = tsk_individual_table_set_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_individual_table_set_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* if location and location_offset are both null, all locations are zero length */ num_rows = 10; ret = tsk_individual_table_set_columns( &table, num_rows, flags, NULL, NULL, NULL, NULL, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL( tsk_memcmp(table.location_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.location_length, 0); ret = tsk_individual_table_append_columns( &table, num_rows, flags, NULL, NULL, NULL, NULL, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL( tsk_memcmp(table.location_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location_offset + num_rows, zeros, num_rows * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.location_length, 0); tsk_individual_table_print_state(&table, _devnull); ret = tsk_individual_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* if parents and parents_offset are both null, all parents are zero length */ num_rows = 10; ret = tsk_individual_table_set_columns( &table, num_rows, flags, NULL, NULL, NULL, NULL, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parents_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.parents_length, 0); 
ret = tsk_individual_table_append_columns( &table, num_rows, flags, NULL, NULL, NULL, NULL, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parents_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parents_offset + num_rows, zeros, num_rows * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.parents_length, 0); tsk_individual_table_print_state(&table, _devnull); ret = tsk_individual_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* if metadata and metadata_offset are both null, all metadatas are zero length */ num_rows = 10; ret = tsk_individual_table_set_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, flags, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location, location, spatial_dimension * num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parents, parents, num_parents * num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_individual_table_append_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location, location, spatial_dimension * num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.location + spatial_dimension * num_rows, location, spatial_dimension * num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parents, parents, num_parents * num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.parents + num_parents * num_rows, parents, num_parents * num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_offset, zeros, (num_rows + 1) * 
sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_offset + num_rows, zeros, num_rows * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); tsk_individual_table_print_state(&table, _devnull); tsk_individual_table_dump_text(&table, _devnull); /* Test extend method */ ret = tsk_individual_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_init(&table2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Can't extend from self */ ret = tsk_individual_table_extend(&table, &table, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANNOT_EXTEND_FROM_SELF); /* Two empty tables */ CU_ASSERT_TRUE(tsk_individual_table_equals(&table, &table2, 0)); ret = tsk_individual_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&table, &table2, 0)); /* Row out of bounds */ ret = tsk_individual_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); /* Num rows out of bounds */ ret = tsk_individual_table_extend(&table, &table2, num_rows * 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); /* Copy rows in order if index NULL */ ret = tsk_individual_table_set_columns(&table2, num_rows, flags, location, location_offset, parents, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_individual_table_equals(&table, &table2, 0)); ret = tsk_individual_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&table, &table2, 0)); /* Copy nothing if index not NULL but length zero */ ret = tsk_individual_table_extend(&table, &table2, 0, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&table, &table2, 0)); /* Copy first N rows in order if index NULL */ ret = 
tsk_individual_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_extend(&table, &table2, num_rows / 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_truncate(&table2, num_rows / 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&table, &table2, 0)); ret = tsk_individual_table_set_columns(&table2, num_rows, flags, location, location_offset, parents, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Copy a subset */ ret = tsk_individual_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_individual_table_equals(&table, &table2, 0)); ret = tsk_individual_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (k = 0; k < num_row_subset; k++) { ret = tsk_individual_table_get_row(&table, (tsk_id_t) k, &individual); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_get_row(&table2, row_subset[k], &individual2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(individual.flags, individual2.flags); CU_ASSERT_EQUAL(individual.location_length, individual2.location_length); CU_ASSERT_EQUAL(individual.parents_length, individual2.parents_length); CU_ASSERT_EQUAL(individual.metadata_length, individual2.metadata_length); CU_ASSERT_EQUAL(tsk_memcmp(individual.location, individual2.location, individual.location_length * sizeof(*individual.location)), 0); CU_ASSERT_EQUAL(tsk_memcmp(individual.parents, individual2.parents, individual.parents_length * sizeof(*individual.parents)), 0); CU_ASSERT_EQUAL(tsk_memcmp(individual.metadata, individual2.metadata, individual.metadata_length * sizeof(*individual.metadata)), 0); } ret = tsk_individual_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.metadata_schema_length, 0); CU_ASSERT_EQUAL(table.metadata_schema, NULL); const char *example = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_length = (tsk_size_t) 
strlen(example); const char *example2 = "A different example 🎄🌳🌴🌲🎋"; tsk_size_t example2_length = (tsk_size_t) strlen(example); tsk_individual_table_set_metadata_schema(&table, example, example_length); CU_ASSERT_EQUAL(table.metadata_schema_length, example_length); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_schema, example, example_length), 0); tsk_individual_table_copy(&table, &table2, TSK_NO_INIT); CU_ASSERT_EQUAL(table.metadata_schema_length, table2.metadata_schema_length); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_schema, table2.metadata_schema, example_length), 0); tsk_individual_table_set_metadata_schema(&table2, example, example_length); CU_ASSERT_TRUE(tsk_individual_table_equals(&table, &table2, 0)); tsk_individual_table_set_metadata_schema(&table2, example2, example2_length); CU_ASSERT_FALSE(tsk_individual_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_individual_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_individual_table_clear(&table); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_individual_table_free(&table); CU_ASSERT_EQUAL(ret, 0); ret = tsk_individual_table_free(&table2); CU_ASSERT_EQUAL(ret, 0); free(flags); free(location); free(location_offset); free(parents); free(parents_offset); free(metadata); free(metadata_offset); } static void test_individual_table_takeset(void) { int ret = 0; tsk_id_t ret_id; tsk_individual_table_t source_table, table; tsk_size_t num_rows = 100; tsk_id_t j; tsk_size_t k; tsk_flags_t *flags; double *location; tsk_id_t *parents; char *metadata; tsk_size_t *metadata_offset; tsk_size_t *parents_offset; tsk_size_t *location_offset; tsk_size_t spatial_dimension = 2; tsk_size_t num_parents = 3; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; double test_location[spatial_dimension]; tsk_id_t test_parents[num_parents]; tsk_size_t zeros[num_rows + 1]; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); /* Make 
a table to copy from */ ret = tsk_individual_table_init(&source_table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (k = 0; k < spatial_dimension; k++) { test_location[k] = (double) k; } for (k = 0; k < num_parents; k++) { test_parents[k] = (tsk_id_t) k + 42; } for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_individual_table_add_row(&source_table, (tsk_flags_t) j, test_location, spatial_dimension, test_parents, num_parents, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); } /* Prepare arrays to be taken */ flags = tsk_malloc(num_rows * sizeof(tsk_flags_t)); CU_ASSERT_FATAL(flags != NULL); tsk_memcpy(flags, source_table.flags, num_rows * sizeof(tsk_flags_t)); location = tsk_malloc(spatial_dimension * num_rows * sizeof(double)); CU_ASSERT_FATAL(location != NULL); tsk_memcpy( location, source_table.location, spatial_dimension * num_rows * sizeof(double)); location_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(location_offset != NULL); tsk_memcpy(location_offset, source_table.location_offset, (num_rows + 1) * sizeof(tsk_size_t)); parents = tsk_malloc(num_parents * num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(parents != NULL); tsk_memcpy(parents, source_table.parents, num_parents * num_rows * sizeof(tsk_id_t)); parents_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(parents_offset != NULL); tsk_memcpy(parents_offset, source_table.parents_offset, (num_rows + 1) * sizeof(tsk_size_t)); metadata = tsk_malloc(num_rows * test_metadata_length * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); tsk_memcpy( metadata, source_table.metadata, num_rows * test_metadata_length * sizeof(char)); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); tsk_memcpy(metadata_offset, source_table.metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_individual_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add one row so that we can check 
takeset frees it */ ret_id = tsk_individual_table_add_row(&table, (tsk_flags_t) 1, test_location, spatial_dimension, test_parents, num_parents, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_individual_table_takeset_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&source_table, &table, 0)); /* Test error states, all of these must not take the array, or free existing */ /* location and location offset must be simultaneously NULL or not */ ret = tsk_individual_table_takeset_columns(&table, num_rows, flags, location, NULL, parents, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_individual_table_takeset_columns(&table, num_rows, flags, NULL, location_offset, NULL, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* parents and parents offset must be simultaneously NULL or not */ ret = tsk_individual_table_takeset_columns(&table, num_rows, flags, location, location_offset, parents, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_individual_table_takeset_columns(&table, num_rows, flags, location, location_offset, NULL, parents_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* metadata and metadata offset must be simultaneously NULL or not */ ret = tsk_individual_table_takeset_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_individual_table_takeset_columns(&table, num_rows, flags, location, location_offset, parents, parents_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Truncation after takeset keeps memory and max_rows */ ret = tsk_individual_table_clear(&table); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL_FATAL(table.max_rows, num_rows); /* if ragged array and offset are both null, all entries are zero length, NULL flags mean all zero entries */ num_rows = 10; ret = tsk_individual_table_takeset_columns( &table, num_rows, NULL, NULL, NULL, NULL, NULL, NULL, NULL); CU_ASSERT_EQUAL(tsk_memcmp(table.flags, zeros, num_rows * sizeof(tsk_flags_t)), 0); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL( tsk_memcmp(table.location_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.location_length, 0); CU_ASSERT_EQUAL( tsk_memcmp(table.parents_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.parents_length, 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_offset, zeros, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.metadata_length, 0); ret = tsk_individual_table_free(&table); CU_ASSERT_EQUAL(ret, 0); ret = tsk_individual_table_free(&source_table); CU_ASSERT_EQUAL(ret, 0); } static void test_individual_table_update_row(void) { int ret; tsk_id_t ret_id; tsk_individual_table_t table; tsk_individual_t row; double location[] = { 0, 1, 2 }; tsk_id_t parents[] = { 0, 1, 2 }; const char *metadata = "ABC"; ret = tsk_individual_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_individual_table_add_row(&table, 0, location, 1, parents, 1, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row(&table, 1, location, 2, parents, 2, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row(&table, 2, location, 3, parents, 3, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_individual_table_update_row( &table, 0, 1, &location[1], 1, &parents[1], 1, &metadata[1], 1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 1); CU_ASSERT_EQUAL_FATAL(row.location_length, 1); CU_ASSERT_EQUAL_FATAL(row.location[0], 
1.0); CU_ASSERT_EQUAL_FATAL(row.parents_length, 1); CU_ASSERT_EQUAL_FATAL(row.parents[0], 1); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_individual_table_update_row(&table, 0, row.flags + 1, row.location, row.location_length, row.parents, row.parents_length, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 2); CU_ASSERT_EQUAL_FATAL(row.location_length, 1); CU_ASSERT_EQUAL_FATAL(row.location[0], 1.0); CU_ASSERT_EQUAL_FATAL(row.parents_length, 1); CU_ASSERT_EQUAL_FATAL(row.parents[0], 1); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_individual_table_update_row(&table, 0, row.flags, location, 1, row.parents, row.parents_length, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 2); CU_ASSERT_EQUAL_FATAL(row.location_length, 1); CU_ASSERT_EQUAL_FATAL(row.location[0], 0.0); CU_ASSERT_EQUAL_FATAL(row.parents_length, 1); CU_ASSERT_EQUAL_FATAL(row.parents[0], 1); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_individual_table_update_row(&table, 0, row.flags, NULL, 0, row.parents, row.parents_length, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 2); CU_ASSERT_EQUAL_FATAL(row.location_length, 0); CU_ASSERT_EQUAL_FATAL(row.parents_length, 1); CU_ASSERT_EQUAL_FATAL(row.parents[0], 1); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_individual_table_update_row( &table, 0, 2, location, 3, parents, 3, metadata, 3); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = 
tsk_individual_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 2); CU_ASSERT_EQUAL_FATAL(row.location_length, 3); CU_ASSERT_EQUAL_FATAL(row.location[0], 0); CU_ASSERT_EQUAL_FATAL(row.location[1], 1); CU_ASSERT_EQUAL_FATAL(row.location[2], 2); CU_ASSERT_EQUAL_FATAL(row.parents_length, 3); CU_ASSERT_EQUAL_FATAL(row.parents[0], 0); CU_ASSERT_EQUAL_FATAL(row.parents[1], 1); CU_ASSERT_EQUAL_FATAL(row.parents[2], 2); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_individual_table_update_row(&table, 1, 5, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_get_row(&table, 1, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 5); CU_ASSERT_EQUAL_FATAL(row.location_length, 0); CU_ASSERT_EQUAL_FATAL(row.parents_length, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0); ret = tsk_individual_table_get_row(&table, 2, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.flags, 2); CU_ASSERT_EQUAL_FATAL(row.location_length, 3); CU_ASSERT_EQUAL_FATAL(row.location[0], 0); CU_ASSERT_EQUAL_FATAL(row.location[1], 1); CU_ASSERT_EQUAL_FATAL(row.location[2], 2); CU_ASSERT_EQUAL_FATAL(row.parents_length, 3); CU_ASSERT_EQUAL_FATAL(row.parents[0], 0); CU_ASSERT_EQUAL_FATAL(row.parents[1], 1); CU_ASSERT_EQUAL_FATAL(row.parents[2], 2); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_individual_table_update_row(&table, 3, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_individual_table_free(&table); } static void test_individual_table_keep_rows(void) { int ret; tsk_id_t ret_id; tsk_individual_t row; double location[] = { 0, 1, 2 }; tsk_id_t parents[] 
= { -1, 1, -1 }; const char *metadata = "ABC"; tsk_bool_t keep[3] = { 1, 1, 1 }; tsk_id_t indexes[] = { 0, 1, 2 }; tsk_id_t id_map[3]; tsk_individual_table_t source, t1, t2; tsk_size_t j; ret = tsk_individual_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_individual_table_add_row(&source, 0, location, 1, parents, 1, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row(&source, 1, location, 2, parents, 2, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row(&source, 2, location, 3, parents, 3, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_individual_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&t1, &source, 0)); ret = tsk_individual_table_keep_rows(&t1, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&t1, &source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_individual_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_individual_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_individual_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_individual_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.flags, 1); CU_ASSERT_EQUAL_FATAL(row.parents_length, 2); CU_ASSERT_EQUAL_FATAL(row.parents[0], -1); CU_ASSERT_EQUAL_FATAL(row.parents[1], 0); 
CU_ASSERT_EQUAL_FATAL(row.location_length, 2); CU_ASSERT_EQUAL_FATAL(row.location[0], 0); CU_ASSERT_EQUAL_FATAL(row.location[1], 1); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 2); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); tsk_individual_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_individual_table_copy(&source, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_truncate(&t1, j + 1); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_individual_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_individual_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_individual_table_equals(&source, &t2, 0)); tsk_individual_table_free(&t1); tsk_individual_table_free(&t2); } tsk_individual_table_free(&source); } static void test_individual_table_keep_rows_parent_references(void) { int ret; tsk_id_t ret_id; tsk_individual_table_t source, t; tsk_bool_t keep[] = { 1, 1, 1, 1 }; tsk_id_t parents[] = { -1, 1, 2 }; tsk_id_t id_map[4]; ret = tsk_individual_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_individual_table_add_row(&source, 0, NULL, 0, parents, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row(&source, 0, NULL, 0, parents, 3, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row(&source, 0, NULL, 0, parents, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row(&source, 0, NULL, 0, parents, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_individual_table_copy(&source, &t, 0); 
CU_ASSERT_EQUAL_FATAL(ret, 0); /* OOB errors */ t.parents[0] = -2; ret = tsk_individual_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(t.num_rows, 4); t.parents[0] = 4; ret = tsk_individual_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(t.num_rows, 4); /* But ignored if row is not kept */ keep[0] = false; ret = tsk_individual_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_individual_table_free(&t); ret = tsk_individual_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Try to remove referenced row 2 */ keep[0] = true; keep[2] = false; ret = tsk_individual_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_KEEP_ROWS_MAP_TO_DELETED); CU_ASSERT_TRUE(tsk_individual_table_equals(&source, &t, 0)); tsk_individual_table_free(&t); ret = tsk_individual_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* remove unreferenced row 0 */ keep[0] = false; keep[2] = true; ret = tsk_individual_table_keep_rows(&t, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t.num_rows, 3); CU_ASSERT_EQUAL_FATAL(t.parents[0], TSK_NULL); CU_ASSERT_EQUAL_FATAL(t.parents[1], 0); CU_ASSERT_EQUAL_FATAL(t.parents[2], 1); tsk_individual_table_free(&t); /* Check that we don't change the table in error cases. */ source.parents[1] = -2; ret = tsk_individual_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = true; ret = tsk_individual_table_keep_rows(&t, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); CU_ASSERT_TRUE(tsk_individual_table_equals(&source, &t, 0)); tsk_individual_table_free(&t); /* Check that we don't change the table in error cases. 
*/ source.parents[1] = 0; ret = tsk_individual_table_copy(&source, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = false; ret = tsk_individual_table_keep_rows(&t, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_KEEP_ROWS_MAP_TO_DELETED); CU_ASSERT_TRUE(tsk_individual_table_equals(&source, &t, 0)); tsk_individual_table_free(&t); tsk_individual_table_free(&source); } static void test_population_table(void) { int ret; tsk_id_t ret_id; tsk_population_table_t table, table2; tsk_size_t num_rows = 100; tsk_size_t max_len = 20; tsk_size_t k, len; tsk_id_t j; char *metadata; char c[max_len + 1]; tsk_size_t *metadata_offset; tsk_population_t population, population2; tsk_id_t row_subset[6] = { 1, 9, 1, 0, 2, 2 }; tsk_size_t num_row_subset = 6; for (j = 0; j < (tsk_id_t) max_len; j++) { c[j] = (char) ('A' + j); } ret = tsk_population_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_population_table_set_max_rows_increment(&table, 1); tsk_population_table_set_max_metadata_length_increment(&table, 1); tsk_population_table_print_state(&table, _devnull); ret = tsk_population_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Adding zero length metadata with NULL should be fine */ ret_id = tsk_population_table_add_row(&table, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); CU_ASSERT_EQUAL(table.num_rows, 1); CU_ASSERT_EQUAL(table.metadata_offset[0], 0); CU_ASSERT_EQUAL(table.metadata_offset[1], 0); tsk_population_table_clear(&table); CU_ASSERT_EQUAL(table.num_rows, 0); len = 0; for (j = 0; j < (tsk_id_t) num_rows; j++) { k = TSK_MIN((tsk_size_t) j + 1, max_len); ret_id = tsk_population_table_add_row(&table, c, k); CU_ASSERT_EQUAL_FATAL(ret_id, j); CU_ASSERT_EQUAL(table.metadata_offset[j], len); CU_ASSERT_EQUAL(table.num_rows, (tsk_size_t) j + 1); len += k; CU_ASSERT_EQUAL(table.metadata_offset[j + 1], len); CU_ASSERT_EQUAL(table.metadata_length, len); ret = tsk_population_table_get_row(&table, (tsk_id_t) j, 
&population); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(population.id, j); CU_ASSERT_EQUAL(population.metadata_length, k); CU_ASSERT_NSTRING_EQUAL(population.metadata, c, k); } /* Test equality with and without metadata */ tsk_population_table_copy(&table, &table2, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_population_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the metadata values */ table2.metadata[0] = 0; CU_ASSERT_FALSE(tsk_population_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_population_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Change the last metadata entry */ table2.metadata_offset[table2.num_rows] = table2.metadata_offset[table2.num_rows - 1]; CU_ASSERT_FALSE(tsk_population_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_population_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); /* Delete all metadata */ tsk_memset(table2.metadata_offset, 0, (table2.num_rows + 1) * sizeof(*table2.metadata_offset)); CU_ASSERT_FALSE(tsk_population_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_population_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_population_table_free(&table2); ret = tsk_population_table_get_row(&table, (tsk_id_t) num_rows, &population); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); tsk_population_table_print_state(&table, _devnull); ret = tsk_population_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); num_rows *= 2; metadata = tsk_malloc(num_rows * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); for (j = 0; j < (tsk_id_t) num_rows; j++) { metadata[j] = 'M'; metadata_offset[j] = (tsk_size_t) j; } metadata_offset[num_rows] = num_rows; ret = tsk_population_table_set_columns(&table, num_rows, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); /* Append another num_rows */ ret = tsk_population_table_append_columns( &table, num_rows, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata + num_rows, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.metadata_length, 2 * num_rows); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); /* Truncate back to num_rows */ ret = tsk_population_table_truncate(&table, num_rows); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.metadata_length, num_rows); ret = tsk_population_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); /* Metadata = NULL gives an error */ ret = tsk_population_table_set_columns(&table, num_rows, NULL, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_population_table_set_columns(&table, num_rows, metadata, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_population_table_set_columns(&table, num_rows, NULL, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Test extend method */ ret = tsk_population_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_init(&table2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Can't extend from self */ ret = tsk_population_table_extend(&table, &table, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANNOT_EXTEND_FROM_SELF); /* Two empty tables */ CU_ASSERT_TRUE(tsk_population_table_equals(&table, &table2, 0)); ret = tsk_population_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_TRUE(tsk_population_table_equals(&table, &table2, 0)); /* Row out of bounds */ ret = tsk_population_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); /* Num rows out of bounds */ ret = tsk_population_table_extend(&table, &table2, num_rows * 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); /* Copy rows in order if index NULL */ ret = tsk_population_table_set_columns(&table2, num_rows, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_population_table_equals(&table, &table2, 0)); ret = tsk_population_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&table, &table2, 0)); /* Copy nothing if index not NULL but length zero */ ret = tsk_population_table_extend(&table, &table2, 0, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&table, &table2, 0)); /* Copy first N rows in order if index NULL */ ret = tsk_population_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_extend(&table, &table2, num_rows / 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_truncate(&table2, num_rows / 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&table, &table2, 0)); ret = tsk_population_table_set_columns(&table2, num_rows, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Copy a subset */ ret = tsk_population_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_population_table_equals(&table, &table2, 0)); ret = tsk_population_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (k = 0; k < num_row_subset; k++) { ret = tsk_population_table_get_row(&table, (tsk_id_t) k, &population); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_get_row(&table2, row_subset[k], 
&population2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(population.metadata_length, population2.metadata_length); CU_ASSERT_EQUAL(tsk_memcmp(population.metadata, population2.metadata, population.metadata_length * sizeof(*population.metadata)), 0); } /* Test for bad offsets */ metadata_offset[0] = 1; ret = tsk_population_table_set_columns(&table, num_rows, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); metadata_offset[0] = 0; metadata_offset[num_rows] = 0; ret = tsk_population_table_set_columns(&table, num_rows, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); ret = tsk_population_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(table.metadata_schema_length, 0); CU_ASSERT_EQUAL(table.metadata_schema, NULL); const char *example = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_length = (tsk_size_t) strlen(example); const char *example2 = "A different example 🎄🌳🌴🌲🎋"; tsk_size_t example2_length = (tsk_size_t) strlen(example); tsk_population_table_set_metadata_schema(&table, example, example_length); CU_ASSERT_EQUAL(table.metadata_schema_length, example_length); CU_ASSERT_EQUAL(tsk_memcmp(table.metadata_schema, example, example_length), 0); tsk_population_table_copy(&table, &table2, TSK_NO_INIT); CU_ASSERT_EQUAL(table.metadata_schema_length, table2.metadata_schema_length); CU_ASSERT_EQUAL( tsk_memcmp(table.metadata_schema, table2.metadata_schema, example_length), 0); tsk_population_table_set_metadata_schema(&table2, example, example_length); CU_ASSERT_TRUE(tsk_population_table_equals(&table, &table2, 0)); tsk_population_table_set_metadata_schema(&table2, example2, example2_length); CU_ASSERT_FALSE(tsk_population_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_population_table_equals(&table, &table2, TSK_CMP_IGNORE_METADATA)); tsk_population_table_clear(&table); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.metadata_length, 0); 
tsk_population_table_free(&table); CU_ASSERT_EQUAL(ret, 0); tsk_population_table_free(&table2); CU_ASSERT_EQUAL(ret, 0); free(metadata); free(metadata_offset); } static void test_population_table_takeset(void) { int ret = 0; tsk_id_t ret_id; tsk_population_table_t source_table, table; tsk_size_t num_rows = 100; tsk_id_t j; char *metadata; tsk_size_t *metadata_offset; const char *test_metadata = "test"; tsk_size_t test_metadata_length = 4; tsk_size_t zeros[num_rows + 1]; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); /* Make a table to copy from */ ret = tsk_population_table_init(&source_table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_population_table_add_row( &source_table, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); } /* Prepare arrays to be taken */ metadata = tsk_malloc(num_rows * test_metadata_length * sizeof(char)); CU_ASSERT_FATAL(metadata != NULL); tsk_memcpy( metadata, source_table.metadata, num_rows * test_metadata_length * sizeof(char)); metadata_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(metadata_offset != NULL); tsk_memcpy(metadata_offset, source_table.metadata_offset, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_population_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add one row so that we can check takeset frees it */ ret_id = tsk_population_table_add_row(&table, test_metadata, test_metadata_length); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_population_table_takeset_columns( &table, num_rows, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&source_table, &table, 0)); /* Test error states, all of these must not take the array, or free existing */ ret = tsk_population_table_takeset_columns(&table, num_rows, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_population_table_takeset_columns(&table, num_rows, metadata, NULL); 
CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_population_table_takeset_columns(&table, num_rows, NULL, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Test bad offset */ metadata_offset[0] = 1; ret = tsk_population_table_takeset_columns( &table, num_rows, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); /* Truncation after takeset keeps memory and max_rows */ ret = tsk_population_table_clear(&table); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(table.max_rows, num_rows); ret = tsk_population_table_free(&table); CU_ASSERT_EQUAL(ret, 0); ret = tsk_population_table_free(&source_table); CU_ASSERT_EQUAL(ret, 0); } static void test_population_table_update_row(void) { int ret; tsk_id_t ret_id; tsk_population_table_t table; tsk_population_t row; const char *metadata = "ABC"; ret = tsk_population_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_population_table_add_row(&table, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&table, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&table, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_population_table_update_row(&table, 0, &metadata[1], 1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_population_table_update_row(&table, 0, row.metadata, row.metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 1); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'B'); ret = tsk_population_table_update_row(&table, 0, metadata, 3); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); 
CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_population_table_update_row(&table, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_get_row(&table, 1, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 0); ret = tsk_population_table_get_row(&table, 2, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 3); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.metadata[2], 'C'); ret = tsk_population_table_update_row(&table, 3, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); tsk_population_table_free(&table); } static void test_population_table_keep_rows(void) { int ret; tsk_id_t ret_id; tsk_size_t j; tsk_population_table_t source, t1, t2; tsk_population_t row; const char *metadata = "ABC"; tsk_bool_t keep[3] = { 1, 1, 1 }; tsk_id_t id_map[3]; tsk_id_t indexes[] = { 0, 1, 2 }; ret = tsk_population_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_population_table_add_row(&source, metadata, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&source, metadata, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&source, metadata, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_population_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&t1, &source, 0)); ret = tsk_population_table_keep_rows(&t1, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&t1, &source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_population_table_keep_rows(&t1, keep, 0, id_map); 
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_population_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_population_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_population_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.metadata_length, 2); CU_ASSERT_EQUAL_FATAL(row.metadata[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.metadata[1], 'B'); tsk_population_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_population_table_copy(&source, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_truncate(&t1, j + 1); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_population_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_population_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_population_table_equals(&source, &t2, 0)); tsk_population_table_free(&t1); tsk_population_table_free(&t2); } tsk_population_table_free(&source); } static void test_provenance_table(void) { int ret; tsk_id_t ret_id; tsk_provenance_table_t table, table2; tsk_size_t num_rows = 100; tsk_size_t j; char *timestamp; tsk_size_t *timestamp_offset; const char *test_timestamp = "2017-12-06T20:40:25+00:00"; tsk_size_t test_timestamp_length = (tsk_size_t) 
strlen(test_timestamp); char timestamp_copy[test_timestamp_length + 1]; char *record; tsk_size_t *record_offset; const char *test_record = "{\"json\"=1234}"; tsk_size_t test_record_length = (tsk_size_t) strlen(test_record); char record_copy[test_record_length + 1]; tsk_provenance_t provenance, provenance2; tsk_id_t row_subset[6] = { 1, 9, 1, 0, 2, 2 }; tsk_size_t num_row_subset = 6; timestamp_copy[test_timestamp_length] = '\0'; record_copy[test_record_length] = '\0'; ret = tsk_provenance_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_provenance_table_set_max_rows_increment(&table, 1); tsk_provenance_table_set_max_timestamp_length_increment(&table, 1); tsk_provenance_table_set_max_record_length_increment(&table, 1); tsk_provenance_table_print_state(&table, _devnull); ret = tsk_provenance_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_rows; j++) { ret_id = tsk_provenance_table_add_row(&table, test_timestamp, test_timestamp_length, test_record, test_record_length); CU_ASSERT_EQUAL_FATAL(ret_id, (tsk_id_t) j); CU_ASSERT_EQUAL(table.timestamp_length, (j + 1) * test_timestamp_length); CU_ASSERT_EQUAL(table.timestamp_offset[j + 1], table.timestamp_length); CU_ASSERT_EQUAL(table.record_length, (j + 1) * test_record_length); CU_ASSERT_EQUAL(table.record_offset[j + 1], table.record_length); /* check the timestamp */ tsk_memcpy(timestamp_copy, table.timestamp + table.timestamp_offset[j], test_timestamp_length); CU_ASSERT_NSTRING_EQUAL(timestamp_copy, test_timestamp, test_timestamp_length); /* check the record */ tsk_memcpy( record_copy, table.record + table.record_offset[j], test_record_length); CU_ASSERT_NSTRING_EQUAL(record_copy, test_record, test_record_length); ret = tsk_provenance_table_get_row(&table, (tsk_id_t) j, &provenance); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(provenance.id, (tsk_id_t) j); CU_ASSERT_EQUAL(provenance.timestamp_length, test_timestamp_length); CU_ASSERT_NSTRING_EQUAL( provenance.timestamp, 
test_timestamp, test_timestamp_length); CU_ASSERT_EQUAL(provenance.record_length, test_record_length); CU_ASSERT_NSTRING_EQUAL(provenance.record, test_record, test_record_length); } ret = tsk_provenance_table_get_row(&table, (tsk_id_t) num_rows, &provenance); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); tsk_provenance_table_print_state(&table, _devnull); ret = tsk_provenance_table_dump_text(&table, _devnull); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_provenance_table_clear(&table); CU_ASSERT_EQUAL(table.num_rows, 0); CU_ASSERT_EQUAL(table.timestamp_length, 0); CU_ASSERT_EQUAL(table.record_length, 0); num_rows *= 2; timestamp = tsk_malloc(num_rows * sizeof(char)); tsk_memset(timestamp, 'a', num_rows * sizeof(char)); CU_ASSERT_FATAL(timestamp != NULL); timestamp_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(timestamp_offset != NULL); record = tsk_malloc(num_rows * sizeof(char)); tsk_memset(record, 'a', num_rows * sizeof(char)); CU_ASSERT_FATAL(record != NULL); record_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(record_offset != NULL); for (j = 0; j < num_rows + 1; j++) { timestamp_offset[j] = j; record_offset[j] = j; } ret = tsk_provenance_table_set_columns( &table, num_rows, timestamp, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.timestamp, timestamp, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.timestamp_offset, timestamp_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.record, record, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.record_offset, record_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.timestamp_length, num_rows); CU_ASSERT_EQUAL(table.record_length, num_rows); tsk_provenance_table_print_state(&table, _devnull); /* Append another num_rows onto the end */ ret = tsk_provenance_table_append_columns( 
&table, num_rows, timestamp, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.timestamp, timestamp, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.timestamp + num_rows, timestamp, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.record, record, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( tsk_memcmp(table.record + num_rows, record, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(table.num_rows, 2 * num_rows); CU_ASSERT_EQUAL(table.timestamp_length, 2 * num_rows); CU_ASSERT_EQUAL(table.record_length, 2 * num_rows); tsk_provenance_table_print_state(&table, _devnull); /* Truncate back to num_rows */ ret = tsk_provenance_table_truncate(&table, num_rows); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_memcmp(table.timestamp, timestamp, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.timestamp_offset, timestamp_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.record, record, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(tsk_memcmp(table.record_offset, record_offset, (num_rows + 1) * sizeof(tsk_size_t)), 0); CU_ASSERT_EQUAL(table.num_rows, num_rows); CU_ASSERT_EQUAL(table.timestamp_length, num_rows); CU_ASSERT_EQUAL(table.record_length, num_rows); tsk_provenance_table_print_state(&table, _devnull); /* Test equality with and without timestamp */ tsk_provenance_table_copy(&table, &table2, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_provenance_table_equals(&table, &table2, TSK_CMP_IGNORE_TIMESTAMPS)); /* Change the timestamp values */ table2.timestamp[0] = 0; CU_ASSERT_FALSE(tsk_provenance_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_provenance_table_equals(&table, &table2, TSK_CMP_IGNORE_TIMESTAMPS)); /* Change the last timestamp entry */ table2.timestamp_offset[table2.num_rows] = table2.timestamp_offset[table2.num_rows - 1]; CU_ASSERT_FALSE(tsk_provenance_table_equals(&table, &table2, 0)); 
CU_ASSERT_TRUE( tsk_provenance_table_equals(&table, &table2, TSK_CMP_IGNORE_TIMESTAMPS)); /* Delete all timestamps */ tsk_memset(table2.timestamp_offset, 0, (table2.num_rows + 1) * sizeof(*table2.timestamp_offset)); CU_ASSERT_FALSE(tsk_provenance_table_equals(&table, &table2, 0)); CU_ASSERT_TRUE( tsk_provenance_table_equals(&table, &table2, TSK_CMP_IGNORE_TIMESTAMPS)); tsk_provenance_table_free(&table2); /* Test equality with and without timestamp */ tsk_provenance_table_copy(&table, &table2, 0); table2.record_length = 0; CU_ASSERT_FALSE(tsk_provenance_table_equals(&table, &table2, 0)); tsk_provenance_table_free(&table2); ret = tsk_provenance_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); /* No arguments can be null */ ret = tsk_provenance_table_set_columns( &table, num_rows, NULL, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_provenance_table_set_columns( &table, num_rows, timestamp, NULL, record, record_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_provenance_table_set_columns( &table, num_rows, timestamp, timestamp_offset, NULL, record_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_provenance_table_set_columns( &table, num_rows, timestamp, timestamp_offset, record, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Test extend method */ ret = tsk_provenance_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_init(&table2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Can't extend from self */ ret = tsk_provenance_table_extend(&table, &table, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANNOT_EXTEND_FROM_SELF); /* Two empty tables */ CU_ASSERT_TRUE(tsk_provenance_table_equals(&table, &table2, 0)); ret = tsk_provenance_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&table, &table2, 0)); /* Row out of bounds */ 
ret = tsk_provenance_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); /* Num rows out of bounds */ ret = tsk_provenance_table_extend(&table, &table2, num_rows * 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); /* Copy rows in order if index NULL */ ret = tsk_provenance_table_set_columns( &table2, num_rows, timestamp, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_provenance_table_equals(&table, &table2, 0)); ret = tsk_provenance_table_extend(&table, &table2, table2.num_rows, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&table, &table2, 0)); /* Copy nothing if index not NULL but length zero */ ret = tsk_provenance_table_extend(&table, &table2, 0, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&table, &table2, 0)); /* Copy first N rows in order if index NULL */ ret = tsk_provenance_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_extend(&table, &table2, num_rows / 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_truncate(&table2, num_rows / 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&table, &table2, 0)); ret = tsk_provenance_table_set_columns( &table2, num_rows, timestamp, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Copy a subset */ ret = tsk_provenance_table_truncate(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_provenance_table_equals(&table, &table2, 0)); ret = tsk_provenance_table_extend(&table, &table2, num_row_subset, row_subset, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_row_subset; j++) { ret = tsk_provenance_table_get_row(&table, (tsk_id_t) j, &provenance); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_get_row(&table2, row_subset[j], &provenance2); 
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(provenance.timestamp_length, provenance2.timestamp_length); CU_ASSERT_EQUAL(provenance.record_length, provenance2.record_length); CU_ASSERT_EQUAL(tsk_memcmp(provenance.timestamp, provenance2.timestamp, provenance.timestamp_length * sizeof(*provenance.timestamp)), 0); CU_ASSERT_EQUAL(tsk_memcmp(provenance.record, provenance2.record, provenance.record_length * sizeof(*provenance.record)), 0); } tsk_provenance_table_free(&table); tsk_provenance_table_free(&table2); free(timestamp); free(timestamp_offset); free(record); free(record_offset); } static void test_provenance_table_takeset(void) { int ret = 0; tsk_id_t ret_id; tsk_provenance_table_t source_table, table; tsk_size_t num_rows = 100; tsk_id_t j; char *timestamp; tsk_size_t *timestamp_offset; char *record; tsk_size_t *record_offset; const char *test_timestamp = "red"; tsk_size_t test_timestamp_length = 3; const char *test_record = "test"; tsk_size_t test_record_length = 4; tsk_size_t zeros[num_rows + 1]; tsk_id_t neg_ones[num_rows]; tsk_memset(zeros, 0, (num_rows + 1) * sizeof(tsk_size_t)); tsk_memset(neg_ones, 0xff, num_rows * sizeof(tsk_id_t)); /* Make a table to copy from */ ret = tsk_provenance_table_init(&source_table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) num_rows; j++) { ret_id = tsk_provenance_table_add_row(&source_table, test_timestamp, test_timestamp_length, test_record, test_record_length); CU_ASSERT_EQUAL_FATAL(ret_id, j); } /* Prepare arrays to be taken */ timestamp = tsk_malloc(num_rows * test_timestamp_length * sizeof(char)); CU_ASSERT_FATAL(timestamp != NULL); tsk_memcpy(timestamp, source_table.timestamp, num_rows * test_timestamp_length * sizeof(char)); timestamp_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(timestamp_offset != NULL); tsk_memcpy(timestamp_offset, source_table.timestamp_offset, (num_rows + 1) * sizeof(tsk_size_t)); record = tsk_malloc(num_rows * test_record_length * sizeof(char)); 
CU_ASSERT_FATAL(record != NULL); tsk_memcpy( record, source_table.record, num_rows * test_record_length * sizeof(char)); record_offset = tsk_malloc((num_rows + 1) * sizeof(tsk_size_t)); CU_ASSERT_FATAL(record_offset != NULL); tsk_memcpy( record_offset, source_table.record_offset, (num_rows + 1) * sizeof(tsk_size_t)); ret = tsk_provenance_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add one row so that we can check takeset frees it */ ret_id = tsk_provenance_table_add_row( &table, test_timestamp, test_timestamp_length, test_record, test_record_length); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_provenance_table_takeset_columns( &table, num_rows, timestamp, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&source_table, &table, 0)); /* Test error states, all of these must not take the array, or free existing */ ret = tsk_provenance_table_takeset_columns( &table, num_rows, NULL, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_provenance_table_takeset_columns( &table, num_rows, timestamp, NULL, record, record_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_provenance_table_takeset_columns( &table, num_rows, timestamp, timestamp_offset, NULL, record_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_provenance_table_takeset_columns( &table, num_rows, timestamp, timestamp_offset, record, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Bad offsets */ timestamp_offset[0] = 1; ret = tsk_provenance_table_takeset_columns( &table, num_rows, timestamp, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); timestamp_offset[0] = 0; record_offset[0] = 1; ret = tsk_provenance_table_takeset_columns( &table, num_rows, timestamp, timestamp_offset, record, record_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); /* Truncation after takeset keeps memory and max_rows */ ret = 
tsk_provenance_table_clear(&table); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(table.max_rows, num_rows); ret = tsk_provenance_table_free(&table); CU_ASSERT_EQUAL(ret, 0); ret = tsk_provenance_table_free(&source_table); CU_ASSERT_EQUAL(ret, 0); } static void test_provenance_table_update_row(void) { int ret; tsk_id_t ret_id; tsk_provenance_table_t table; tsk_provenance_t row; const char *timestamp = "XYZ"; const char *record = "ABC"; ret = tsk_provenance_table_init(&table, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_provenance_table_add_row(&table, timestamp, 1, record, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_provenance_table_add_row(&table, timestamp, 2, record, 2); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_provenance_table_add_row(&table, timestamp, 3, record, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_provenance_table_update_row(&table, 0, ×tamp[1], 1, &record[1], 1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.timestamp_length, 1); CU_ASSERT_EQUAL_FATAL(row.timestamp[0], 'Y'); CU_ASSERT_EQUAL_FATAL(row.record_length, 1); CU_ASSERT_EQUAL_FATAL(row.record[0], 'B'); ret = tsk_provenance_table_update_row( &table, 0, row.timestamp, row.timestamp_length, row.record, row.record_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.timestamp_length, 1); CU_ASSERT_EQUAL_FATAL(row.timestamp[0], 'Y'); CU_ASSERT_EQUAL_FATAL(row.record_length, 1); CU_ASSERT_EQUAL_FATAL(row.record[0], 'B'); ret = tsk_provenance_table_update_row(&table, 0, row.timestamp, row.timestamp_length - 1, row.record, row.record_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.timestamp_length, 0); CU_ASSERT_EQUAL_FATAL(row.record_length, 1); CU_ASSERT_EQUAL_FATAL(row.record[0], 'B'); ret = 
tsk_provenance_table_update_row(&table, 0, timestamp, 3, record, 3); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_get_row(&table, 0, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.timestamp_length, 3); CU_ASSERT_EQUAL_FATAL(row.timestamp[0], 'X'); CU_ASSERT_EQUAL_FATAL(row.timestamp[1], 'Y'); CU_ASSERT_EQUAL_FATAL(row.timestamp[2], 'Z'); CU_ASSERT_EQUAL_FATAL(row.record_length, 3); CU_ASSERT_EQUAL_FATAL(row.record[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.record[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.record[2], 'C'); ret = tsk_provenance_table_update_row(&table, 1, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_get_row(&table, 1, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.timestamp_length, 0); CU_ASSERT_EQUAL_FATAL(row.record_length, 0); ret = tsk_provenance_table_get_row(&table, 2, &row); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(row.timestamp_length, 3); CU_ASSERT_EQUAL_FATAL(row.timestamp[0], 'X'); CU_ASSERT_EQUAL_FATAL(row.timestamp[1], 'Y'); CU_ASSERT_EQUAL_FATAL(row.timestamp[2], 'Z'); CU_ASSERT_EQUAL_FATAL(row.record_length, 3); CU_ASSERT_EQUAL_FATAL(row.record[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.record[1], 'B'); CU_ASSERT_EQUAL_FATAL(row.record[2], 'C'); ret = tsk_provenance_table_update_row(&table, 3, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); tsk_provenance_table_free(&table); } static void test_provenance_table_keep_rows(void) { int ret; tsk_id_t ret_id; tsk_size_t j; tsk_provenance_table_t source, t1, t2; tsk_provenance_t row; const char *timestamp = "XYZ"; const char *record = "ABC"; tsk_bool_t keep[3] = { 1, 1, 1 }; tsk_id_t indexes[] = { 0, 1, 2 }; tsk_id_t id_map[3]; ret = tsk_provenance_table_init(&source, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_provenance_table_add_row(&source, timestamp, 1, record, 1); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_provenance_table_add_row(&source, timestamp, 2, record, 2); 
CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_provenance_table_add_row(&source, timestamp, 3, record, 3); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_provenance_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&t1, &source, 0)); ret = tsk_provenance_table_keep_rows(&t1, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&t1, &source, 0)); CU_ASSERT_EQUAL_FATAL(id_map[0], 0); CU_ASSERT_EQUAL_FATAL(id_map[1], 1); CU_ASSERT_EQUAL_FATAL(id_map[2], 2); keep[0] = 0; keep[1] = 0; keep[2] = 0; ret = tsk_provenance_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 0); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], -1); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_provenance_table_copy(&source, &t1, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); keep[0] = 0; keep[1] = 1; keep[2] = 0; ret = tsk_provenance_table_keep_rows(&t1, keep, 0, id_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(t1.num_rows, 1); CU_ASSERT_EQUAL_FATAL(id_map[0], -1); CU_ASSERT_EQUAL_FATAL(id_map[1], 0); CU_ASSERT_EQUAL_FATAL(id_map[2], -1); ret = tsk_provenance_table_get_row(&t1, 0, &row); CU_ASSERT_EQUAL_FATAL(row.timestamp_length, 2); CU_ASSERT_EQUAL_FATAL(row.timestamp[0], 'X'); CU_ASSERT_EQUAL_FATAL(row.timestamp[1], 'Y'); CU_ASSERT_EQUAL_FATAL(row.record_length, 2); CU_ASSERT_EQUAL_FATAL(row.record[0], 'A'); CU_ASSERT_EQUAL_FATAL(row.record[1], 'B'); tsk_provenance_table_free(&t1); keep[0] = 0; keep[1] = 0; keep[2] = 0; /* Keeping first n rows equivalent to truncate */ for (j = 0; j < source.num_rows; j++) { ret = tsk_provenance_table_copy(&source, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_copy(&source, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_truncate(&t1, j + 1); 
CU_ASSERT_EQUAL_FATAL(ret, 0); keep[j] = 1; ret = tsk_provenance_table_keep_rows(&t2, keep, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&t1, &t2, 0)); /* Adding the remaining rows back on to the table gives the original * table */ ret = tsk_provenance_table_extend( &t2, &source, source.num_rows - j - 1, indexes + j + 1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_provenance_table_equals(&source, &t2, 0)); tsk_provenance_table_free(&t1); tsk_provenance_table_free(&t2); } tsk_provenance_table_free(&source); } static void test_table_size_increments(void) { int ret; tsk_table_collection_t tables; tsk_size_t new_size; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_metadata_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_location_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.max_metadata_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.max_metadata_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.max_ancestral_state_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_metadata_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_derived_state_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_metadata_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.populations.max_metadata_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows_increment, 0); 
CU_ASSERT_EQUAL_FATAL(tables.provenances.max_timestamp_length_increment, 0); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_record_length_increment, 0); /* Setting to non-zero sets to that size */ new_size = 1; ret = tsk_individual_table_set_max_rows_increment(&tables.individuals, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows_increment, new_size); ret = tsk_individual_table_set_max_metadata_length_increment( &tables.individuals, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_metadata_length_increment, new_size); ret = tsk_individual_table_set_max_location_length_increment( &tables.individuals, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_location_length_increment, new_size); ret = tsk_node_table_set_max_rows_increment(&tables.nodes, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows_increment, new_size); ret = tsk_node_table_set_max_metadata_length_increment(&tables.nodes, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length_increment, new_size); ret = tsk_edge_table_set_max_rows_increment(&tables.edges, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows_increment, new_size); ret = tsk_edge_table_set_max_metadata_length_increment(&tables.edges, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.max_metadata_length_increment, new_size); ret = tsk_site_table_set_max_rows_increment(&tables.sites, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows_increment, new_size); ret = tsk_site_table_set_max_metadata_length_increment(&tables.sites, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.max_metadata_length_increment, new_size); ret = tsk_site_table_set_max_ancestral_state_length_increment( &tables.sites, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL_FATAL(tables.sites.max_ancestral_state_length_increment, new_size); ret = tsk_mutation_table_set_max_rows_increment(&tables.mutations, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows_increment, new_size); ret = tsk_mutation_table_set_max_metadata_length_increment( &tables.mutations, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_metadata_length_increment, new_size); ret = tsk_mutation_table_set_max_derived_state_length_increment( &tables.mutations, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_derived_state_length_increment, new_size); ret = tsk_migration_table_set_max_rows_increment(&tables.migrations, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows_increment, new_size); ret = tsk_migration_table_set_max_metadata_length_increment( &tables.migrations, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_metadata_length_increment, new_size); ret = tsk_population_table_set_max_rows_increment(&tables.populations, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows_increment, new_size); ret = tsk_population_table_set_max_metadata_length_increment( &tables.populations, new_size); CU_ASSERT_EQUAL_FATAL(tables.populations.max_metadata_length_increment, new_size); ret = tsk_provenance_table_set_max_rows_increment(&tables.provenances, new_size); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows_increment, new_size); ret = tsk_provenance_table_set_max_timestamp_length_increment( &tables.provenances, new_size); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_timestamp_length_increment, new_size); ret = tsk_provenance_table_set_max_record_length_increment( &tables.provenances, new_size); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_record_length_increment, new_size); tsk_table_collection_free(&tables); 
} static void test_table_expansion(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_table_collection_t tables2; ret = tsk_table_collection_init(&tables2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Individual table */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 1); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /*Extending by a small amount results in 1024 rows in the first case*/ ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 1024); /*Extending by an amount that fits doesn't grow the table*/ ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, 1023, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 1024); /*Extending by an amount that doesn't fit doubles the table*/ ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, 1024, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 2048); /*Extending by an amount greater than the next double extends to that amount*/ ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, 4096, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 4097); /*After extending beyond 2^21 subsequent extension doesn't double but adds 2^21*/ ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, 2097152, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 2097153); ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, 2097154, NULL, 0); 
CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 4194305); /*Extending by more rows than possible results in overflow*/ ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, TSK_MAX_ID, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 4194305); /*Setting a custom extension uses that*/ ret = tsk_individual_table_set_max_rows_increment(&tables.individuals, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, 4194305, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 4194305 + 42); /*Setting a custom extension that overflows errors*/ ret = tsk_individual_table_set_max_rows_increment(&tables.individuals, TSK_MAX_ID); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_extend( &tables.individuals, &tables2.individuals, 4194305 + 42 + 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.individuals.max_rows, 4194305 + 42); tsk_table_collection_free(&tables); /* Node table */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 1); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /*Extending by a small amount results in 1024 rows in the first case*/ ret = tsk_node_table_extend(&tables.nodes, &tables2.nodes, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 1024); /*Extending by an amount that fits doesn't grow the table*/ ret = tsk_node_table_extend(&tables.nodes, &tables2.nodes, 1023, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 1024); /*Extending by an amount that doesn't fit doubles the table*/ ret = tsk_node_table_extend(&tables.nodes, &tables2.nodes, 
1024, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 2048); /*Extending by an amount greater than the next double extends to that amount*/ ret = tsk_node_table_extend(&tables.nodes, &tables2.nodes, 4096, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 4097); /*After extending beyond 2^21 subsequent extension doesn't double but adds 2^21*/ ret = tsk_node_table_extend(&tables.nodes, &tables2.nodes, 2097152, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 2097153); ret = tsk_node_table_extend(&tables.nodes, &tables2.nodes, 2097154, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 4194305); /*Extending by more rows than possible results in overflow*/ ret = tsk_node_table_extend(&tables.nodes, &tables2.nodes, TSK_MAX_ID, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 4194305); /*Setting a custom extension uses that*/ ret = tsk_node_table_set_max_rows_increment(&tables.nodes, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_extend(&tables.nodes, &tables2.nodes, 4194305, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 4194305 + 42); /*Setting a custom extension that overflows errors*/ ret = tsk_node_table_set_max_rows_increment(&tables.nodes, TSK_MAX_ID); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_extend( &tables.nodes, &tables2.nodes, 4194305 + 42 + 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_rows, 4194305 + 42); tsk_table_collection_free(&tables); /* Edge table */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 1); ret_id = tsk_edge_table_add_row(&tables.edges, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /*Extending by a small amount results in 1024 
rows in the first case*/ ret = tsk_edge_table_extend(&tables.edges, &tables2.edges, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 1024); /*Extending by an amount that fits doesn't grow the table*/ ret = tsk_edge_table_extend(&tables.edges, &tables2.edges, 1023, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 1024); /*Extending by an amount that doesn't fit doubles the table*/ ret = tsk_edge_table_extend(&tables.edges, &tables2.edges, 1024, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 2048); /*Extending by an amount greater than the next double extends to that amount*/ ret = tsk_edge_table_extend(&tables.edges, &tables2.edges, 4096, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 4097); /*After extending beyond 2^21 subsequent extension doesn't double but adds 2^21*/ ret = tsk_edge_table_extend(&tables.edges, &tables2.edges, 2097152, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 2097153); ret = tsk_edge_table_extend(&tables.edges, &tables2.edges, 2097154, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 4194305); /*Extending by more rows than possible results in overflow*/ ret = tsk_edge_table_extend(&tables.edges, &tables2.edges, TSK_MAX_ID, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 4194305); /*Setting a custom extension uses that*/ ret = tsk_edge_table_set_max_rows_increment(&tables.edges, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_extend(&tables.edges, &tables2.edges, 4194305, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 4194305 + 42); /*Setting a custom extension that overflows errors*/ ret = tsk_edge_table_set_max_rows_increment(&tables.edges, 
TSK_MAX_ID); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_extend( &tables.edges, &tables2.edges, 4194305 + 42 + 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.edges.max_rows, 4194305 + 42); tsk_table_collection_free(&tables); /* Migration table */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 1); ret_id = tsk_migration_table_add_row(&tables.migrations, 0, 0, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /*Extending by a small amount results in 1024 rows in the first case*/ ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 1024); /*Extending by an amount that fits doesn't grow the table*/ ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, 1023, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 1024); /*Extending by an amount that doesn't fit doubles the table*/ ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, 1024, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 2048); /*Extending by an amount greater than the next double extends to that amount*/ ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, 4096, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 4097); /*After extending beyond 2^21 subsequent extension doesn't double but adds 2^21*/ ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, 2097152, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 2097153); ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, 2097154, NULL, 0); 
CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 4194305); /*Extending by more rows than possible results in overflow*/ ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, TSK_MAX_ID, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 4194305); /*Setting a custom extension uses that*/ ret = tsk_migration_table_set_max_rows_increment(&tables.migrations, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, 4194305, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 4194305 + 42); /*Setting a custom extension that overflows errors*/ ret = tsk_migration_table_set_max_rows_increment(&tables.migrations, TSK_MAX_ID); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_extend( &tables.migrations, &tables2.migrations, 4194305 + 42 + 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.migrations.max_rows, 4194305 + 42); tsk_table_collection_free(&tables); /* Site table */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 1); ret_id = tsk_site_table_add_row(&tables.sites, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /*Extending by a small amount results in 1024 rows in the first case*/ ret = tsk_site_table_extend(&tables.sites, &tables2.sites, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 1024); /*Extending by an amount that fits doesn't grow the table*/ ret = tsk_site_table_extend(&tables.sites, &tables2.sites, 1023, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 1024); /*Extending by an amount that doesn't fit doubles the table*/ ret = tsk_site_table_extend(&tables.sites, &tables2.sites, 1024, NULL, 0); 
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 2048); /*Extending by an amount greater than the next double extends to that amount*/ ret = tsk_site_table_extend(&tables.sites, &tables2.sites, 4096, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 4097); /*After extending beyond 2^21 subsequent extension doesn't double but adds 2^21*/ ret = tsk_site_table_extend(&tables.sites, &tables2.sites, 2097152, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 2097153); ret = tsk_site_table_extend(&tables.sites, &tables2.sites, 2097154, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 4194305); /*Extending by more rows than possible results in overflow*/ ret = tsk_site_table_extend(&tables.sites, &tables2.sites, TSK_MAX_ID, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 4194305); /*Setting a custom extension uses that*/ ret = tsk_site_table_set_max_rows_increment(&tables.sites, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_extend(&tables.sites, &tables2.sites, 4194305, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 4194305 + 42); /*Setting a custom extension that overflows errors*/ ret = tsk_site_table_set_max_rows_increment(&tables.sites, TSK_MAX_ID); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_extend( &tables.sites, &tables2.sites, 4194305 + 42 + 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.sites.max_rows, 4194305 + 42); tsk_table_collection_free(&tables); /* Mutation table */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 1); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /*Extending by a small amount results in 
1024 rows in the first case*/ ret = tsk_mutation_table_extend(&tables.mutations, &tables2.mutations, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 1024); /*Extending by an amount that fits doesn't grow the table*/ ret = tsk_mutation_table_extend( &tables.mutations, &tables2.mutations, 1023, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 1024); /*Extending by an amount that doesn't fit doubles the table*/ ret = tsk_mutation_table_extend( &tables.mutations, &tables2.mutations, 1024, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 2048); /*Extending by an amount greater than the next double extends to that amount*/ ret = tsk_mutation_table_extend( &tables.mutations, &tables2.mutations, 4096, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 4097); /*After extending beyond 2^21 subsequent extension doesn't double but adds 2^21*/ ret = tsk_mutation_table_extend( &tables.mutations, &tables2.mutations, 2097152, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 2097153); ret = tsk_mutation_table_extend( &tables.mutations, &tables2.mutations, 2097154, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 4194305); /*Extending by more rows than possible results in overflow*/ ret = tsk_mutation_table_extend( &tables.mutations, &tables2.mutations, TSK_MAX_ID, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 4194305); /*Setting a custom extension uses that*/ ret = tsk_mutation_table_set_max_rows_increment(&tables.mutations, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_extend( &tables.mutations, &tables2.mutations, 4194305, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); 
CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 4194305 + 42); /*Setting a custom extension that overflows errors*/ ret = tsk_mutation_table_set_max_rows_increment(&tables.mutations, TSK_MAX_ID); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_extend( &tables.mutations, &tables2.mutations, 4194305 + 42 + 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.mutations.max_rows, 4194305 + 42); tsk_table_collection_free(&tables); /* Population table */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 1); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /*Extending by a small amount results in 1024 rows in the first case*/ ret = tsk_population_table_extend( &tables.populations, &tables2.populations, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 1024); /*Extending by an amount that fits doesn't grow the table*/ ret = tsk_population_table_extend( &tables.populations, &tables2.populations, 1023, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 1024); /*Extending by an amount that doesn't fit doubles the table*/ ret = tsk_population_table_extend( &tables.populations, &tables2.populations, 1024, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 2048); /*Extending by an amount greater than the next double extends to that amount*/ ret = tsk_population_table_extend( &tables.populations, &tables2.populations, 4096, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 4097); /*After extending beyond 2^21 subsequent extension doesn't double but adds 2^21*/ ret = tsk_population_table_extend( &tables.populations, 
&tables2.populations, 2097152, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 2097153); ret = tsk_population_table_extend( &tables.populations, &tables2.populations, 2097154, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 4194305); /*Extending by more rows than possible results in overflow*/ ret = tsk_population_table_extend( &tables.populations, &tables2.populations, TSK_MAX_ID, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 4194305); /*Setting a custom extension uses that*/ ret = tsk_population_table_set_max_rows_increment(&tables.populations, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_extend( &tables.populations, &tables2.populations, 4194305, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 4194305 + 42); /*Setting a custom extension that overflows errors*/ ret = tsk_population_table_set_max_rows_increment(&tables.populations, TSK_MAX_ID); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_population_table_extend( &tables.populations, &tables2.populations, 4194305 + 42 + 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.populations.max_rows, 4194305 + 42); tsk_table_collection_free(&tables); /* Provenance table */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 1); ret_id = tsk_provenance_table_add_row(&tables.provenances, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /*Extending by a small amount results in 1024 rows in the first case*/ ret = tsk_provenance_table_extend( &tables.provenances, &tables2.provenances, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 1024); /*Extending by an amount that fits doesn't grow the table*/ ret = tsk_provenance_table_extend( &tables.provenances, 
&tables2.provenances, 1023, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 1024); /*Extending by an amount that doesn't fit doubles the table*/ ret = tsk_provenance_table_extend( &tables.provenances, &tables2.provenances, 1024, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 2048); /*Extending by an amount greater than the next double extends to that amount*/ ret = tsk_provenance_table_extend( &tables.provenances, &tables2.provenances, 4096, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 4097); /*After extending beyond 2^21 subsequent extension doesn't double but adds 2^21*/ ret = tsk_provenance_table_extend( &tables.provenances, &tables2.provenances, 2097152, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 2097153); ret = tsk_provenance_table_extend( &tables.provenances, &tables2.provenances, 2097154, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 4194305); /*Extending by more rows than possible results in overflow*/ ret = tsk_provenance_table_extend( &tables.provenances, &tables2.provenances, TSK_MAX_ID, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 4194305); /*Setting a custom extension uses that*/ ret = tsk_provenance_table_set_max_rows_increment(&tables.provenances, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_extend( &tables.provenances, &tables2.provenances, 4194305, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 4194305 + 42); /*Setting a custom extension that overflows errors*/ ret = tsk_provenance_table_set_max_rows_increment(&tables.provenances, TSK_MAX_ID); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_provenance_table_extend( &tables.provenances, 
&tables2.provenances, 4194305 + 42 + 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.provenances.max_rows, 4194305 + 42); tsk_table_collection_free(&tables); tsk_table_collection_free(&tables2); } static void test_ragged_expansion(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; char *data = tsk_malloc(104857600 * sizeof(char)); /* Test with node table metadata */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 1); /*Extending by a small amount results in 65536 bytes in the first case*/ ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, 2); CU_ASSERT_EQUAL_FATAL(ret_id, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 65536); /*Extending by an amount that fits doesn't grow the column*/ ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, 65534); CU_ASSERT_EQUAL_FATAL(ret_id, 1); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 65536); /*Extending by an amount that doesn't fit doubles the column*/ ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 2); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 65536 * 2); /*Extending by an amount greater than the next double extends to that amount*/ ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, 1 + (65536 * 2 * 2 - 2 - 65534 - 1)); CU_ASSERT_EQUAL_FATAL(ret_id, 3); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 2 + 65534 + 1 + 196608); /*After extending beyond 100MB subsequent extension doesn't double but adds 100MB*/ ret_id = tsk_node_table_add_row( &tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, 104857600); CU_ASSERT_EQUAL_FATAL(ret_id, 4); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 105119745); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, 1); 
CU_ASSERT_EQUAL_FATAL(ret_id, 5); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 105119745 + 104857600); /*Extending by more bytes than possible results in overflow*/ ret_id = tsk_node_table_add_row( &tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, TSK_MAX_SIZE); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 105119745 + 104857600); tsk_node_table_free(&tables.nodes); ret = tsk_node_table_init(&tables.nodes, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /*Setting a custom extension uses that*/ ret = tsk_node_table_set_max_metadata_length_increment(&tables.nodes, 42); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, 3); CU_ASSERT_EQUAL_FATAL(ret_id, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 43); /*Setting a custom extension that overflows errors*/ ret = tsk_node_table_set_max_metadata_length_increment(&tables.nodes, TSK_MAX_SIZE); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, data, 41); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); CU_ASSERT_EQUAL_FATAL(tables.nodes.max_metadata_length, 43); tsk_table_collection_free(&tables); tsk_safe_free(data); } static void test_link_ancestors_input_errors(void) { int ret; tsk_id_t ret_id; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_edge_table_t result; tsk_id_t samples[] = { 0, 1 }; tsk_id_t ancestors[] = { 4, 6 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add an edge with some metadata */ ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 7); ret_id = tsk_edge_table_add_row(&tables.edges, 0, 1, 7, 6, "metadata", 8); CU_ASSERT_FATAL(ret_id > 0); ret = tsk_edge_table_init(&result, 0); 
CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_link_ancestors( &tables, NULL, 2, ancestors, 2, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); tsk_edge_table_free(&result); tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_init(&result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_link_ancestors( &tables, NULL, 2, ancestors, 2, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Bad sample IDs */ samples[0] = -1; ret = tsk_table_collection_link_ancestors( &tables, samples, 2, ancestors, 2, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* Bad ancestor IDs */ samples[0] = 0; ancestors[0] = -1; ret = tsk_table_collection_link_ancestors( &tables, samples, 2, ancestors, 2, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* Duplicate sample IDs */ ancestors[0] = 4; samples[0] = 1; ret = tsk_table_collection_link_ancestors( &tables, samples, 2, ancestors, 2, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); /* Duplicate sample IDs */ ancestors[0] = 6; samples[0] = 0; ret = tsk_table_collection_link_ancestors( &tables, samples, 2, ancestors, 2, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); /* TODO more tests! 
*/ tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); tsk_edge_table_free(&result); } static void test_link_ancestors_single_tree(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_edge_table_t result; tsk_id_t samples[] = { 0, 1 }; tsk_id_t ancestors[] = { 4, 6 }; size_t i; double res_left = 0; double res_right = 1; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_init(&result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_link_ancestors( &tables, samples, 2, ancestors, 2, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); // Check we get the right result. CU_ASSERT_EQUAL(result.num_rows, 3); tsk_id_t res_parent[] = { 4, 4, 6 }; tsk_id_t res_child[] = { 0, 1, 4 }; for (i = 0; i < result.num_rows; i++) { CU_ASSERT_EQUAL(res_parent[i], result.parent[i]); CU_ASSERT_EQUAL(res_child[i], result.child[i]); CU_ASSERT_EQUAL(res_left, result.left[i]); CU_ASSERT_EQUAL(res_right, result.right[i]); } tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); tsk_edge_table_free(&result); } static void test_link_ancestors_no_edges(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_edge_table_t result; tsk_id_t samples[] = { 2 }; tsk_id_t ancestors[] = { 4 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_init(&result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_link_ancestors( &tables, samples, 1, ancestors, 1, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); tsk_edge_table_free(&result); tsk_treeseq_free(&ts); } static void test_link_ancestors_samples_and_ancestors_overlap(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_edge_table_t 
result; tsk_id_t samples[] = { 0, 1, 2, 4 }; tsk_id_t ancestors[] = { 2 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_init(&result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_link_ancestors( &tables, samples, 4, ancestors, 1, 0, &result); // tsk_edge_table_print_state(&result, stdout); CU_ASSERT_EQUAL_FATAL(ret, 0); // Check we get the right result. CU_ASSERT_EQUAL(result.num_rows, 2); size_t i; tsk_id_t res_parent = 4; tsk_id_t res_child[] = { 0, 1 }; double res_left = 0; double res_right = 1; for (i = 0; i < result.num_rows; i++) { CU_ASSERT_EQUAL(res_parent, result.parent[i]); CU_ASSERT_EQUAL(res_child[i], result.child[i]); CU_ASSERT_EQUAL(res_left, result.left[i]); CU_ASSERT_EQUAL(res_right, result.right[i]); } tsk_table_collection_free(&tables); tsk_edge_table_free(&result); tsk_treeseq_free(&ts); } static void test_link_ancestors_paper(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_edge_table_t result; tsk_id_t samples[] = { 0, 1, 2 }; tsk_id_t ancestors[] = { 5, 6, 7 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_init(&result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_link_ancestors( &tables, samples, 3, ancestors, 3, 0, &result); // tsk_edge_table_print_state(&result, stdout); // Check we get the right result. 
CU_ASSERT_EQUAL(result.num_rows, 6); size_t i; tsk_id_t res_parent[] = { 5, 5, 6, 6, 7, 7 }; tsk_id_t res_child[] = { 1, 2, 0, 5, 0, 5 }; double res_left[] = { 0, 2, 0, 0, 7, 7 }; double res_right[] = { 10, 10, 7, 7, 10, 10 }; for (i = 0; i < result.num_rows; i++) { CU_ASSERT_EQUAL(res_parent[i], result.parent[i]); CU_ASSERT_EQUAL(res_child[i], result.child[i]); CU_ASSERT_EQUAL(res_left[i], result.left[i]); CU_ASSERT_EQUAL(res_right[i], result.right[i]); } tsk_table_collection_free(&tables); tsk_edge_table_free(&result); tsk_treeseq_free(&ts); } static void test_link_ancestors_multiple_to_single_tree(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_edge_table_t result; tsk_id_t samples[] = { 1, 3 }; tsk_id_t ancestors[] = { 5 }; size_t i; tsk_id_t res_parent = 5; tsk_id_t res_child[] = { 1, 3 }; double res_left = 0; double res_right = 10; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_init(&result, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_link_ancestors( &tables, samples, 2, ancestors, 1, 0, &result); CU_ASSERT_EQUAL(result.num_rows, 2); for (i = 0; i < result.num_rows; i++) { CU_ASSERT_EQUAL(res_parent, result.parent[i]); CU_ASSERT_EQUAL(res_child[i], result.child[i]); CU_ASSERT_EQUAL(res_left, result.left[i]); CU_ASSERT_EQUAL(res_right, result.right[i]); } tsk_table_collection_free(&tables); tsk_edge_table_free(&result); tsk_treeseq_free(&ts); } static void verify_ibd_segment_list(tsk_identity_segment_list_t *list, tsk_size_t num_nodes) { tsk_identity_segment_t *seg; double total_span = 0; tsk_size_t num_segments = 0; /* double last_right = 0; */ for (seg = list->head; seg != NULL; seg = seg->next) { CU_ASSERT_FATAL(seg->left < seg->right); CU_ASSERT_FATAL(seg->node >= 0); CU_ASSERT_FATAL(seg->node < (tsk_id_t) num_nodes); total_span 
+= seg->right - seg->left; num_segments++; /* TODO the segments are not necessarily in order - issue #1682 */ /* CU_ASSERT_FATAL(seg->left >= last_right); */ /* last_right = seg->right; */ } CU_ASSERT_EQUAL_FATAL(total_span, list->total_span); CU_ASSERT_EQUAL_FATAL(num_segments, list->num_segments); } static void verify_ibd_result(tsk_identity_segments_t *result) { int ret; tsk_size_t j; tsk_id_t a, b; int64_t index; tsk_size_t total_segments = 0; double total_span = 0; tsk_size_t num_pairs = tsk_identity_segments_get_num_pairs(result); tsk_id_t *pairs = tsk_malloc(2 * tsk_identity_segments_get_num_pairs(result) * sizeof(*pairs)); tsk_id_t *pairs2 = tsk_malloc(2 * tsk_identity_segments_get_num_pairs(result) * sizeof(*pairs)); tsk_identity_segment_list_t **lists = tsk_malloc(tsk_identity_segments_get_num_pairs(result) * sizeof(*lists)); tsk_avl_node_int_t **avl_nodes = tsk_malloc(result->pair_map.size * sizeof(*avl_nodes)); CU_ASSERT_FATAL(pairs != NULL); CU_ASSERT_FATAL(pairs2 != NULL); CU_ASSERT_FATAL(avl_nodes != NULL); CU_ASSERT_FATAL(lists != NULL); CU_ASSERT_EQUAL_FATAL(num_pairs, result->pair_map.size); tsk_identity_segments_print_state(result, _devnull); ret = tsk_identity_segments_get_keys(result, pairs); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_avl_tree_int_ordered_nodes(&result->pair_map, avl_nodes); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_pairs; j++) { a = pairs[2 * j]; b = pairs[2 * j + 1]; index = a * (int64_t) result->num_nodes + b; CU_ASSERT(a < b); CU_ASSERT_EQUAL(tsk_avl_tree_int_search(&result->pair_map, index), avl_nodes[j]); index = b * (int64_t) result->num_nodes + a; CU_ASSERT_EQUAL(tsk_avl_tree_int_search(&result->pair_map, index), NULL); } ret = tsk_identity_segments_get_items(result, pairs2, lists); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_pairs; j++) { CU_ASSERT_EQUAL_FATAL(pairs[2 * j], pairs2[2 * j]); CU_ASSERT_EQUAL_FATAL(pairs[2 * j + 1], pairs2[2 * j + 1]); verify_ibd_segment_list(lists[j], result->num_nodes); 
total_segments += lists[j]->num_segments; total_span += lists[j]->total_span; } CU_ASSERT_EQUAL_FATAL(result->num_segments, total_segments); CU_ASSERT_DOUBLE_EQUAL(result->total_span, total_span, 1e-6); free(pairs); free(pairs2); free(lists); free(avl_nodes); } static void test_ibd_segments_debug(void) { tsk_treeseq_t ts; int ret; tsk_identity_segments_t result; tsk_size_t sizes[] = { 2, 2 }; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); tsk_set_debug_stream(_devnull); /* Run the DEBUG code */ ret = tsk_table_collection_ibd_within( ts.tables, &result, NULL, 0, 0.0, DBL_MAX, TSK_DEBUG); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_between( ts.tables, &result, 2, sizes, samples, 0.0, DBL_MAX, TSK_DEBUG); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_within( ts.tables, &result, NULL, 0, 0.0, DBL_MAX, TSK_DEBUG | TSK_IBD_STORE_PAIRS); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_within( ts.tables, &result, NULL, 0, 0.0, DBL_MAX, TSK_DEBUG | TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_identity_segments_free(&result); tsk_set_debug_stream(stdout); tsk_treeseq_free(&ts); } static void test_ibd_segments_caterpillar_tree(void) { int ret; tsk_identity_segments_t result; tsk_treeseq_t *ts = caterpillar_tree(100, 1, 5); /* We're just testing out the memory expansion in ibd_finder */ ret = tsk_table_collection_ibd_within(ts->tables, &result, NULL, 0, 0.0, DBL_MAX, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_identity_segments_free(&result); tsk_treeseq_free(ts); free(ts); } static void test_ibd_segments_single_tree(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1 }; tsk_size_t sizes[] = { 1, 1 }; tsk_identity_segments_t result; tsk_identity_segment_list_t *list = NULL; 
tsk_identity_segment_t *seg = NULL; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Only get IBD segs for (0, 1) */ ret = tsk_table_collection_ibd_within( &tables, &result, samples, 2, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_identity_segments_get(&result, samples[0], samples[1], &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(list != NULL); seg = list->head; CU_ASSERT_EQUAL_FATAL(seg->next, NULL); CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, 4); /* Queries for other sample pairs fail */ ret = tsk_identity_segments_get(&result, 0, 2, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(list, NULL); ret = tsk_identity_segments_get(&result, 1, 3, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(list, NULL); tsk_identity_segments_print_state(&result, _devnull); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_num_segments(&result), 1); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_total_span(&result), 1); verify_ibd_result(&result); tsk_identity_segments_free(&result); /* Get IBD segs among all pairs of samples */ ret = tsk_table_collection_ibd_within( &tables, &result, NULL, 0, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); /* We have 4 samples, so 4 choose 2 sample pairs */ CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_num_segments(&result), 6); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_total_span(&result), 6); ret = tsk_identity_segments_get(&result, 0, 1, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); seg = list->head; CU_ASSERT_FATAL(seg != NULL); CU_ASSERT_EQUAL_FATAL(seg->next, NULL); CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, 4); ret = tsk_identity_segments_get(&result, 3, 0, &list); 
CU_ASSERT_EQUAL_FATAL(ret, 0); seg = list->head; CU_ASSERT_FATAL(seg != NULL); CU_ASSERT_EQUAL_FATAL(seg->next, NULL); CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, 6); verify_ibd_result(&result); tsk_identity_segments_free(&result); /* Get segs between {0} and {1} */ ret = tsk_table_collection_ibd_between( ts.tables, &result, 2, sizes, samples, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_ibd_result(&result); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_num_segments(&result), 1); ret = tsk_identity_segments_get(&result, 0, 1, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); seg = list->head; CU_ASSERT_FATAL(seg != NULL); CU_ASSERT_EQUAL_FATAL(seg->next, NULL); CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, 4); tsk_identity_segments_free(&result); /* within an empty list gives no segments */ ret = tsk_table_collection_ibd_within(&tables, &result, samples, 0, 0.0, DBL_MAX, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_num_segments(&result), 0); tsk_identity_segments_free(&result); /* Between an empty list gives no segments */ ret = tsk_table_collection_ibd_between( ts.tables, &result, 0, sizes, samples, 0.0, DBL_MAX, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_num_segments(&result), 0); tsk_identity_segments_free(&result); /* Between one empty list gives no segments*/ sizes[0] = 0; ret = tsk_table_collection_ibd_between( ts.tables, &result, 2, sizes, samples, 0.0, DBL_MAX, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_num_segments(&result), 0); tsk_identity_segments_free(&result); sizes[0] = 2; tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_ibd_segments_single_tree_options(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_identity_segments_t result; 
tsk_identity_segment_list_t *list = NULL; tsk_id_t pairs[12]; tsk_identity_segment_list_t *lists[6]; tsk_flags_t options[2]; int k; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_ibd_within(&tables, &result, NULL, 0, 0.0, DBL_MAX, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* We have 4 samples, so 4 choose 2 sample pairs */ CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_num_segments(&result), 6); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_total_span(&result), 6); /* out-of-bounds is still detected */ ret = tsk_identity_segments_get(&result, 0, 100, &list); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* By default all specific queries fail on the ibd_segments result */ ret = tsk_identity_segments_get(&result, 0, 1, &list); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IBD_PAIRS_NOT_STORED); ret = tsk_identity_segments_get_keys(&result, pairs); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IBD_PAIRS_NOT_STORED); ret = tsk_identity_segments_get_items(&result, pairs, lists); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IBD_PAIRS_NOT_STORED); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_within( &tables, &result, NULL, 0, 0.0, DBL_MAX, TSK_IBD_STORE_PAIRS); CU_ASSERT_EQUAL_FATAL(ret, 0); /* out-of-bounds is still detected */ ret = tsk_identity_segments_get(&result, 0, 100, &list); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* Getters for the lists now work, but the lists themselves are NULL */ ret = tsk_identity_segments_get(&result, 0, 1, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(list->head, NULL); CU_ASSERT_EQUAL_FATAL(list->total_span, 1); CU_ASSERT_EQUAL_FATAL(list->num_segments, 1); ret = tsk_identity_segments_get_keys(&result, pairs); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(pairs[0], 0); CU_ASSERT_EQUAL_FATAL(pairs[1], 1); ret = 
tsk_identity_segments_get_items(&result, pairs, lists); CU_ASSERT_EQUAL_FATAL(pairs[0], 0); CU_ASSERT_EQUAL_FATAL(pairs[1], 1); CU_ASSERT_EQUAL_FATAL(lists[0]->head, NULL); CU_ASSERT_EQUAL_FATAL(lists[0]->total_span, 1); CU_ASSERT_EQUAL_FATAL(lists[0]->num_segments, 1); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_identity_segments_free(&result); /* store_segments implies store_pairs */ options[0] = TSK_IBD_STORE_SEGMENTS; options[1] = TSK_IBD_STORE_PAIRS | TSK_IBD_STORE_SEGMENTS; for (k = 0; k < 2; k++) { ret = tsk_table_collection_ibd_within( &tables, &result, NULL, 0, 0.0, DBL_MAX, options[k]); CU_ASSERT_EQUAL_FATAL(ret, 0); /* out-of-bounds is still detected */ ret = tsk_identity_segments_get(&result, 0, 100, &list); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_identity_segments_get(&result, 0, 1, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(list->head != NULL); CU_ASSERT_EQUAL_FATAL(list->head->left, 0); CU_ASSERT_EQUAL_FATAL(list->head->right, 1); CU_ASSERT_EQUAL_FATAL(list->head->next, NULL); CU_ASSERT_EQUAL_FATAL(list->total_span, 1); CU_ASSERT_EQUAL_FATAL(list->num_segments, 1); ret = tsk_identity_segments_get_keys(&result, pairs); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(pairs[0], 0); CU_ASSERT_EQUAL_FATAL(pairs[1], 1); ret = tsk_identity_segments_get_items(&result, pairs, lists); CU_ASSERT_EQUAL_FATAL(pairs[0], 0); CU_ASSERT_EQUAL_FATAL(pairs[1], 1); CU_ASSERT_FATAL(lists[0]->head != NULL); CU_ASSERT_EQUAL_FATAL(lists[0]->head->left, 0); CU_ASSERT_EQUAL_FATAL(lists[0]->head->right, 1); CU_ASSERT_EQUAL_FATAL(lists[0]->head->next, NULL); CU_ASSERT_EQUAL_FATAL(lists[0]->total_span, 1); CU_ASSERT_EQUAL_FATAL(lists[0]->num_segments, 1); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_identity_segments_free(&result); } tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_ibd_segments_single_tree_between(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t 
sizes[] = { 2, 2 }; tsk_identity_segments_t result; tsk_identity_segment_list_t *list = NULL; tsk_identity_segment_t *seg = NULL; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Get segs between {0, 1} and {2, 3} */ ret = tsk_table_collection_ibd_between( ts.tables, &result, 2, sizes, samples, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_ibd_result(&result); CU_ASSERT_EQUAL_FATAL(tsk_identity_segments_get_num_segments(&result), 4); ret = tsk_identity_segments_get(&result, 0, 2, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); seg = list->head; CU_ASSERT_FATAL(seg != NULL); CU_ASSERT_EQUAL_FATAL(seg->next, NULL); CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, 6); ret = tsk_identity_segments_get(&result, 0, 3, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); seg = list->head; CU_ASSERT_FATAL(seg != NULL); CU_ASSERT_EQUAL_FATAL(seg->next, NULL); CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, 6); ret = tsk_identity_segments_get(&result, 1, 2, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); seg = list->head; CU_ASSERT_FATAL(seg != NULL); CU_ASSERT_EQUAL_FATAL(seg->next, NULL); CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, 6); ret = tsk_identity_segments_get(&result, 1, 3, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); seg = list->head; CU_ASSERT_FATAL(seg != NULL); CU_ASSERT_EQUAL_FATAL(seg->next, NULL); CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, 6); tsk_identity_segments_free(&result); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_ibd_segments_multiple_trees(void) { int ret; tsk_size_t j, k; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_id_t samples[] 
= { 0, 1, 2 }; tsk_id_t pairs[][2] = { { 0, 1 }, { 0, 2 } }; tsk_size_t num_samples = 3; tsk_size_t num_pairs = 2; tsk_identity_segments_t result; double true_left[2][2] = { { 0.0, 0.75 }, { 0.75, 0.0 } }; double true_right[2][2] = { { 0.75, 1.0 }, { 1.0, 0.75 } }; double true_node[2][2] = { { 4, 5 }, { 5, 6 } }; tsk_identity_segment_list_t *list; tsk_identity_segment_t *seg; tsk_treeseq_from_text(&ts, 2, multiple_tree_ex_nodes, multiple_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_ibd_within( &tables, &result, samples, num_samples, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_pairs; j++) { ret = tsk_identity_segments_get(&result, pairs[j][0], pairs[j][1], &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(list->num_segments, 2); k = 0; for (seg = list->head; seg != NULL; seg = seg->next) { CU_ASSERT_EQUAL_FATAL(seg->left, true_left[j][k]); CU_ASSERT_EQUAL_FATAL(seg->right, true_right[j][k]); CU_ASSERT_EQUAL_FATAL(seg->node, true_node[j][k]); k++; } CU_ASSERT_EQUAL_FATAL(list->num_segments, k); } verify_ibd_result(&result); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_within( &tables, &result, NULL, 0, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_ibd_result(&result); tsk_identity_segments_free(&result); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_ibd_segments_empty_result(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1 }; tsk_identity_segments_t result; tsk_identity_segment_list_t *list; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_ibd_within( &tables, &result, samples, 1, 0.0, 0.5, 
TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_identity_segments_get(&result, samples[0], samples[1], &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(list == NULL); verify_ibd_result(&result); tsk_identity_segments_free(&result); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_ibd_segments_min_span_max_time(void) { int ret; tsk_treeseq_t ts; tsk_identity_segments_t result; tsk_identity_segment_list_t *list; tsk_identity_segment_t *seg; tsk_treeseq_from_text(&ts, 2, multiple_tree_ex_nodes, multiple_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_table_collection_ibd_within( ts.tables, &result, NULL, 0, 0.5, 3.0, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_identity_segments_get(&result, 0, 1, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(list->num_segments, 1); seg = list->head; CU_ASSERT_EQUAL_FATAL(seg->left, 0.0); CU_ASSERT_EQUAL_FATAL(seg->right, 0.75); CU_ASSERT_EQUAL_FATAL(seg->node, 4); ret = tsk_identity_segments_get(&result, 1, 2, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(list, NULL); ret = tsk_identity_segments_get(&result, 0, 2, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(list, NULL); verify_ibd_result(&result); tsk_identity_segments_free(&result); tsk_treeseq_free(&ts); } static void test_ibd_segments_errors(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1, 2 }; tsk_id_t duplicate_samples[] = { 0, 1, 0 }; tsk_id_t samples2[] = { -1, 1 }; tsk_size_t sample_set_sizes[] = { 3 }; tsk_identity_segments_t result; tsk_identity_segment_list_t *list; tsk_treeseq_from_text(&ts, 2, multiple_tree_ex_nodes, multiple_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // Invalid sample IDs ret = tsk_table_collection_ibd_within( &tables, &result, samples2, 1, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); 
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_between(&tables, &result, 1, sample_set_sizes, samples2, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_identity_segments_free(&result); // Bad length or time ret = tsk_table_collection_ibd_within(&tables, &result, samples, 2, 0.0, -1, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_within(&tables, &result, samples, 2, -1, 0.0, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_between(&tables, &result, 1, sample_set_sizes, samples, -1, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_between( &tables, &result, 1, sample_set_sizes, samples, 0, -1, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_identity_segments_free(&result); // Duplicate samples ret = tsk_table_collection_ibd_within( &tables, &result, duplicate_samples, 3, 0.0, DBL_MAX, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); tsk_identity_segments_free(&result); ret = tsk_table_collection_ibd_between(&tables, &result, 1, sample_set_sizes, duplicate_samples, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); tsk_identity_segments_free(&result); // Check for bad inputs to result ret = tsk_table_collection_ibd_within( &tables, &result, NULL, 0, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_identity_segments_get(&result, 0, -1, &list); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_identity_segments_get(&result, -1, 0, &list); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_identity_segments_get(&result, 0, 100, &list); 
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_identity_segments_get(&result, 100, 0, &list); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_identity_segments_get(&result, 0, 5, &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(list, NULL); /* TODO add more checks here */ ret = tsk_identity_segments_get(&result, 0, 0, &list); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SAME_NODES_IN_PAIR); tsk_identity_segments_free(&result); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_ibd_segments_samples_are_descendants(void) { int ret; tsk_treeseq_t ts; tsk_id_t samples[] = { 0, 1, 2, 3, 4, 5 }; tsk_size_t num_samples = 6; tsk_identity_segments_t result; tsk_id_t pairs[][2] = { { 0, 2 }, { 0, 4 }, { 2, 4 }, { 1, 3 }, { 1, 5 }, { 3, 5 } }; tsk_size_t num_pairs = 6; tsk_id_t true_node[] = { 2, 4, 4, 3, 5, 5 }; tsk_size_t j; tsk_identity_segment_list_t *list; tsk_identity_segment_t *seg; tsk_treeseq_from_text(&ts, 1, multi_root_tree_ex_nodes, multi_root_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_table_collection_ibd_within( ts.tables, &result, samples, num_samples, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_pairs; j++) { tsk_identity_segments_get(&result, pairs[j][0], pairs[j][1], &list); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(list != NULL); CU_ASSERT_EQUAL_FATAL(list->num_segments, 1); seg = list->head; CU_ASSERT_EQUAL_FATAL(seg->left, 0); CU_ASSERT_EQUAL_FATAL(seg->right, 1); CU_ASSERT_EQUAL_FATAL(seg->node, true_node[j]); } verify_ibd_result(&result); tsk_identity_segments_free(&result); tsk_treeseq_free(&ts); } static void test_ibd_segments_multiple_ibd_paths(void) { int ret; tsk_size_t j, k; tsk_treeseq_t ts; tsk_id_t pairs[][2] = { { 0, 1 }, { 0, 2 }, { 1, 2 } }; tsk_size_t num_pairs = 3; tsk_identity_segments_t result; double true_left[3][2] = { { 0.2, 0.0 }, { 0.2, 0.0 }, { 0.0, 0.2 } }; double true_right[3][2] = { { 1.0, 0.2 }, { 
1.0, 0.2 }, { 0.2, 1.0 } }; double true_node[3][2] = { { 4, 5 }, { 3, 5 }, { 4, 4 } }; tsk_identity_segment_list_t *list; tsk_identity_segment_t *seg; tsk_treeseq_from_text(&ts, 2, multi_path_tree_ex_nodes, multi_path_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_table_collection_ibd_within( ts.tables, &result, NULL, 0, 0.0, DBL_MAX, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_pairs; j++) { tsk_identity_segments_get(&result, pairs[j][0], pairs[j][1], &list); CU_ASSERT_EQUAL_FATAL(ret, 0); k = 0; for (seg = list->head; seg != NULL; seg = seg->next) { CU_ASSERT_EQUAL_FATAL(seg->left, true_left[j][k]); CU_ASSERT_EQUAL_FATAL(seg->right, true_right[j][k]); CU_ASSERT_EQUAL_FATAL(seg->node, true_node[j][k]); k++; } CU_ASSERT_EQUAL_FATAL(k, 2); } verify_ibd_result(&result); tsk_identity_segments_free(&result); tsk_treeseq_free(&ts); } static void test_ibd_segments_odd_topologies(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1 }; tsk_id_t samples1[] = { 0, 2 }; tsk_identity_segments_t result; tsk_treeseq_from_text( &ts, 1, odd_tree1_ex_nodes, odd_tree1_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // Multiple roots. ret = tsk_table_collection_ibd_within( &tables, &result, samples, 1, 0, 0, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_ibd_result(&result); tsk_identity_segments_free(&result); // Parent is a sample. 
ret = tsk_table_collection_ibd_within( &tables, &result, samples1, 1, 0, 0, TSK_IBD_STORE_SEGMENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_ibd_result(&result); tsk_identity_segments_free(&result); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_simplify_tables_drops_indexes(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_has_index(&tables, 0)) ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)) tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_simplify_empty_tables(void) { int ret; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_table_collection_simplify(&tables, NULL, 0, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 0); tsk_table_collection_free(&tables); } static void test_simplify_metadata(void) { int ret; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; tsk_edge_table_add_row(&tables.edges, 0, 0, 1, 1, "metadata", 8); ret = tsk_table_collection_simplify(&tables, NULL, 0, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA); tsk_table_collection_free(&tables); } static void test_edge_update_invalidates_index(void) { int ret; tsk_id_t ret_id; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); /* Any operations on the edge table should now 
invalidate the index */ ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_has_index(&tables, 0)) ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)); /* Even though the actual indexes still exist */ CU_ASSERT_FALSE(tables.indexes.edge_insertion_order == NULL); CU_ASSERT_FALSE(tables.indexes.edge_removal_order == NULL); CU_ASSERT_EQUAL_FATAL(tables.indexes.num_edges, tsk_treeseq_get_num_edges(&ts)); ret = tsk_treeseq_copy_tables(&ts, &tables, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_has_index(&tables, 0)) ret_id = tsk_edge_table_add_row(&tables.edges, 0, 1, 0, 1, NULL, 0); CU_ASSERT_TRUE(ret_id > 0); CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)); /* Even though the actual indexes still exist */ CU_ASSERT_FALSE(tables.indexes.edge_insertion_order == NULL); CU_ASSERT_FALSE(tables.indexes.edge_removal_order == NULL); CU_ASSERT_EQUAL_FATAL(tables.indexes.num_edges, tsk_treeseq_get_num_edges(&ts)); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_copy_table_collection(void) { int ret; tsk_id_t ret_id; tsk_treeseq_t ts; tsk_table_collection_t tables, tables_copy; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add some migrations, population and provenance */ ret_id = tsk_migration_table_add_row(&tables.migrations, 0, 1, 2, 3, 4, 5, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_migration_table_add_row(&tables.migrations, 1, 2, 3, 4, 5, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_population_table_add_row(&tables.populations, "metadata", 8); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_population_table_add_row(&tables.populations, "other", 5); 
CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_provenance_table_add_row(&tables.provenances, "time", 4, "record", 6); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_provenance_table_add_row(&tables.provenances, "time ", 5, "record ", 7); CU_ASSERT_EQUAL_FATAL(ret_id, 1); tsk_table_collection_copy(&tables, &tables_copy, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, &tables_copy, 0)); tsk_table_collection_free(&tables); tsk_table_collection_free(&tables_copy); tsk_treeseq_free(&ts); } static void test_sort_tables_offsets(void) { int ret; tsk_treeseq_t *ts; tsk_table_collection_t tables, copy; tsk_bookmark_t bookmark; ts = caterpillar_tree(10, 5, 5); ret = tsk_treeseq_copy_tables(ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Check that setting edge offset = len(edges) does nothing */ reverse_edges(&tables); ret = tsk_table_collection_copy(&tables, ©, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.edges = tables.edges.num_rows; ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Check that setting migration offset = len(migrations) does nothing */ reverse_migrations(&tables); ret = tsk_table_collection_copy(&tables, ©, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.migrations = tables.migrations.num_rows; ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tables.sites.num_rows > 2); CU_ASSERT_FATAL(tables.mutations.num_rows > 2); /* Check that setting mutation and site offset = to the len * of the tables leaves 
them untouched. */ reverse_mutations(&tables); /* Swap the positions of the first two sites, as a quick way * to disorder the site table */ tables.sites.position[0] = tables.sites.position[1]; tables.sites.position[1] = 0; ret = tsk_table_collection_copy(&tables, ©, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.sites = tables.sites.num_rows; bookmark.mutations = tables.mutations.num_rows; ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); /* Anything other than len(table) leads to an error for sites * and mutations, and we can't specify one without the other. */ tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.sites = tables.sites.num_rows; ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.mutations = tables.mutations.num_rows; ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.sites = tables.sites.num_rows - 1; bookmark.mutations = tables.mutations.num_rows - 1; ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); /* Individuals must either all be sorted or all skipped */ ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add a parent relation that unsorts the table */ tables.individuals.parents[0] = 5; ret = tsk_table_collection_copy(&tables, ©, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.individuals = tables.individuals.num_rows; ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, ©, 0)); /* Check that sorting would have had no effect as 
individuals not in default sort*/ ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, ©, 0)); /* Individual bookmark ignored */ tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.individuals = tables.individuals.num_rows - 1; ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); tsk_table_collection_free(©); tsk_treeseq_free(ts); free(ts); } static void test_sort_tables_drops_indexes_with_options(tsk_flags_t tc_options) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, tc_options); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_has_index(&tables, 0)) ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)) tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_sort_tables_drops_indexes(void) { test_sort_tables_drops_indexes_with_options(0); test_sort_tables_drops_indexes_with_options(TSK_TC_NO_EDGE_METADATA); } static void test_sort_tables_edge_metadata(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t t1, t2; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); insert_edge_metadata(&t1); ret = tsk_table_collection_copy(&t1, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); reverse_edges(&t1); CU_ASSERT_FALSE(tsk_table_collection_equals(&t1, &t2, 0)); ret = tsk_table_collection_sort(&t1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t1); tsk_table_collection_free(&t2); 
tsk_treeseq_free(&ts); } static void test_sort_tables_no_edge_metadata(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t t1, t2; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &t1, TSK_TC_NO_EDGE_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(t1.edges.options & TSK_TABLE_NO_METADATA); ret = tsk_table_collection_copy(&t1, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(t2.edges.options & TSK_TABLE_NO_METADATA); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); reverse_edges(&t1); CU_ASSERT_FALSE(tsk_table_collection_equals(&t1, &t2, 0)); ret = tsk_table_collection_sort(&t1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); ret = tsk_table_collection_copy(&t1, &t2, TSK_TC_NO_EDGE_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(t1.edges.options & TSK_TABLE_NO_METADATA); CU_ASSERT_TRUE(t2.edges.options & TSK_TABLE_NO_METADATA); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); reverse_edges(&t1); CU_ASSERT_FALSE(tsk_table_collection_equals(&t1, &t2, 0)); ret = tsk_table_collection_sort(&t1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); tsk_table_collection_free(&t1); tsk_treeseq_free(&ts); } static void test_sort_tables_errors(void) { int ret; tsk_id_t ret_id; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_bookmark_t pos; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(&pos, 0, sizeof(pos)); /* Everything 0 should be fine */ ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Everything is sorted already */ pos.edges = 
tables.edges.num_rows; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); pos.edges = (tsk_size_t) -1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); pos.edges = tables.edges.num_rows + 1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); tsk_memset(&pos, 0, sizeof(pos)); pos.migrations = (tsk_size_t) -1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); pos.migrations = tables.migrations.num_rows + 1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); /* Node, population and provenance positions are ignored */ tsk_memset(&pos, 0, sizeof(pos)); pos.nodes = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(&pos, 0, sizeof(pos)); pos.populations = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(&pos, 0, sizeof(pos)); pos.provenances = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Specifying only one of sites or mutations is an error */ tsk_memset(&pos, 0, sizeof(pos)); pos.sites = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); tsk_memset(&pos, 0, sizeof(pos)); pos.mutations = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); /* Test TSK_ERR_MUTATION_PARENT_INCONSISTENT */ ret = tsk_table_collection_clear(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.0, "x", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = 
tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "a", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 3, 0.0, "b", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 1, 0.0, "c", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "d", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_INCONSISTENT); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_sort_tables_mutation_times(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables, t1, t2; const char *sites = "0 0\n" "0.1 0\n" "0.2 0\n" "0.3 0\n"; const char *mutations = "0 0 1 -1 3\n" "1 1 1 -1 3\n" "2 4 1 -1 8\n" "2 1 0 -1 4\n" "2 2 1 -1 3\n" "2 1 1 -1 2\n" "3 6 1 -1 10\n"; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; parse_nodes(single_tree_ex_nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7); tables.nodes.time[4] = 6; tables.nodes.time[5] = 8; tables.nodes.time[6] = 10; parse_edges(single_tree_ex_edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6); parse_sites(sites, &tables.sites); parse_mutations(mutations, &tables.mutations); CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 4); CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 7); tables.sequence_length = 1.0; ret = tsk_table_collection_build_index(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Check to make sure we have legal mutations */ ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_copy_tables(&ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_copy(&t1, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); 
reverse_mutations(&t1); CU_ASSERT_FALSE(tsk_table_collection_equals(&t1, &t2, 0)); ret = tsk_table_collection_sort(&t1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); tsk_table_collection_free(&t1); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_sort_tables_mutations(void) { int ret; tsk_table_collection_t tables; /* Sorting hierarchy: * 1. site * 2. time (when known) * 3. node_time * 4. num_descendants: parent mutations first * 5. node_id * 6. mutation_id */ const char *sites = "0.0 A\n" "0.5 T\n" "0.75 G\n"; const char *mutations_unsorted = /* Test site criterion (primary) - site 1 should come after site 0 */ "1 0 X -1 0.0\n" /* mut 0: site 1, will be sorted after site 0 mutations */ "0 0 Y -1 0.0\n" /* mut 1: site 0, will be sorted before site 1 mutations */ /* Test time criterion - within same site, earlier time first */ "0 4 B -1 2.0\n" /* mut 2: site 0, node 4 (time 1.0), time 2.0 (later time) */ "0 5 A -1 2.5\n" /* mut 3: site 0, node 5 (time 2.0), time 2.5 (earlier relative) */ /* Test unknown vs known times - unknown times at site 2, fall back to node_time sorting */ "2 4 U2 -1\n" /* mut 4: site 2, node 4 (time 1.0), unknown time - falls back to node_time */ "2 4 U3 -1\n" /* mut 5: site 2, node 4 (time 1.0), unknown time - should use mutation_id as tiebreaker */ "2 5 U1 -1\n" /* mut 6: site 2, node 5 (time 2.0), unknown time - falls back to node_time */ /* Test node_time criterion - same site, same mut time, different node times */ "0 4 D -1 1.5\n" /* mut 7: site 0, node 4 (time 1.0), mut time 1.5 */ "0 5 C -1 2.5\n" /* mut 8: site 0, node 5 (time 2.0), mut time 2.5 - same mut time */ /* Test num_descendants criterion with mutation parent-child relationships */ "0 2 P -1 0.0\n" /* mut 9: site 0, node 2, parent mutation (0 descendants initially) */ "0 1 C1 9 0.0\n" /* mut 10: site 0, node 1, child of mut 9 (parent now has 1+ descendants) */ "0 
1 C2 9 0.0\n" /* mut 11: site 0, node 1, another child of mut 9 (parent now has 2+ descendants) */ "0 3 Q -1 0.0\n" /* mut 12: site 0, node 3, no children (0 descendants) */ "0 0 C3 10 0.0\n" /* mut 13: site 0, node 0, child of mut 10 (making mut 9 a grandparent) */ /* Test node and mutation_id criteria for final tiebreaking */ "0 0 Z1 -1 0.0\n" /* mut 14: site 0, node 0, no parent, will test node+id ordering */ "0 0 Z2 -1 0.0\n"; /* mut 15: site 0, node 0, no parent, later in input = higher ID */ const char *mutations_sorted = /* Site 0 mutations - known times first, sorted by time */ "0 5 A -1 2.5\n" "0 5 C -1 2.5\n" "0 4 B -1 2.0\n" "0 4 D -1 1.5\n" "0 2 P -1 0.0\n" "0 1 C1 4 0.0\n" "0 0 Y -1 0.0\n" "0 0 C3 5 0.0\n" "0 0 Z1 -1 0.0\n" "0 0 Z2 -1 0.0\n" "0 1 C2 4 0.0\n" "0 3 Q -1 0.0\n" /* Site 1 mutations */ "1 0 X -1 0.0\n" /* Site 2 mutations - unknown times, sorted by node_time then other criteria */ "2 5 U1 -1\n" "2 4 U2 -1\n" "2 4 U3 -1\n"; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; parse_nodes(single_tree_ex_nodes, &tables.nodes); parse_edges(single_tree_ex_edges, &tables.edges); parse_sites(sites, &tables.sites); CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 3); parse_mutations(mutations_unsorted, &tables.mutations); CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 16); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_t expected; ret = tsk_table_collection_init(&expected, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); expected.sequence_length = 1.0; parse_nodes(single_tree_ex_nodes, &expected.nodes); parse_edges(single_tree_ex_edges, &expected.edges); parse_sites(sites, &expected.sites); parse_mutations(mutations_sorted, &expected.mutations); CU_ASSERT_TRUE(tsk_mutation_table_equals(&tables.mutations, &expected.mutations, 0)); tsk_table_collection_free(&expected); tsk_table_collection_free(&tables); } static void test_sort_tables_canonical_errors(void) { 
int ret; tsk_id_t ret_id; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; tsk_id_t null_p[] = { -1 }; tsk_id_t zero_p[] = { 0 }; tsk_id_t one_p[] = { 1 }; ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.0, "x", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "a", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 3, 0.0, "b", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 1, 0.0, "c", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "d", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_canonicalise(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_INCONSISTENT); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_FATAL(ret == 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "a", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 3, 0.0, "b", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 1, 0.0, "c", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, -1, 0.0, "d", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_canonicalise(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, one_p, 1, NULL, 0); 
CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, zero_p, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_canonicalise(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_PARENT_CYCLE); ret = tsk_individual_table_clear(&tables.individuals); CU_ASSERT_FATAL(ret == 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, zero_p, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, zero_p, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_canonicalise(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_SELF_PARENT); ret = tsk_individual_table_clear(&tables.individuals); CU_ASSERT_FATAL(ret == 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, null_p, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, zero_p, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_canonicalise(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); } static void test_sort_tables_canonical(void) { int ret; tsk_table_collection_t t1, t2; // this is single_tree_ex with individuals and populations const char *nodes = "1 0 -1 1\n" "1 0 2 3\n" "1 0 0 -1\n" "1 0 -1 3\n" "0 1 2 -1\n" "0 2 -1 2\n" "0 3 -1 -1\n"; const char *individuals = "0 0.0 1\n" "0 1.0 -1\n" "0 2.0 1,3\n" "0 3.0 -1,1\n"; const char *sites = "0 0\n" "0.2 0\n" "0.1 0\n"; const char *mutations = "0 0 2 3 0.5\n" "2 1 1 -1 0.5\n" "1 4 3 -1 3\n" "0 4 1 -1 2.5\n" "2 2 1 -1 2\n" "1 1 5 7 0.5\n" "1 2 1 -1 2\n" "1 1 4 2 0.5\n" "1 1 6 7 0.5\n"; const char *nodes_sorted = "1 0 -1 0\n" "1 0 0 1\n" "1 0 1 -1\n" "1 0 -1 1\n" "0 1 0 -1\n" "0 2 -1 2\n" "0 3 -1 -1\n"; const char *individuals_sorted = "0 1.0 -1\n" "0 3.0 -1,0\n" "0 2.0 0,1\n"; const char *sites_sorted = "0 0\n" "0.1 0\n" "0.2 0\n"; const char *mutations_sorted = "0 
4 1 -1 2.5\n" "0 0 2 0 0.5\n" "1 2 1 -1 2\n" "1 1 1 -1 0.5\n" "2 4 3 -1 3\n" "2 2 1 -1 2\n" "2 1 4 4 0.5\n" "2 1 5 6 0.5\n" "2 1 6 6 0.5\n"; const char *individuals_sorted_kept = "0 1.0 -1\n" "0 3.0 -1,0\n" "0 2.0 0,1\n" "0 0.0 0\n"; ret = tsk_table_collection_init(&t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); t1.sequence_length = 1.0; ret = tsk_table_collection_init(&t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); t2.sequence_length = 1.0; parse_nodes(nodes, &t1.nodes); CU_ASSERT_EQUAL_FATAL(t1.nodes.num_rows, 7); parse_individuals(individuals, &t1.individuals); CU_ASSERT_EQUAL_FATAL(t1.individuals.num_rows, 4); tsk_population_table_add_row(&t1.populations, "A", 1); tsk_population_table_add_row(&t1.populations, "B", 1); tsk_population_table_add_row(&t1.populations, "C", 1); parse_edges(single_tree_ex_edges, &t1.edges); CU_ASSERT_EQUAL_FATAL(t1.edges.num_rows, 6); parse_sites(sites, &t1.sites); CU_ASSERT_EQUAL_FATAL(t1.sites.num_rows, 3); parse_mutations(mutations, &t1.mutations); CU_ASSERT_EQUAL_FATAL(t1.mutations.num_rows, 9); ret = tsk_table_collection_canonicalise(&t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); parse_nodes(nodes_sorted, &t2.nodes); tsk_population_table_add_row(&t2.populations, "C", 1); tsk_population_table_add_row(&t2.populations, "A", 1); CU_ASSERT_EQUAL_FATAL(t2.nodes.num_rows, 7); parse_individuals(individuals_sorted, &t2.individuals); CU_ASSERT_EQUAL_FATAL(t2.individuals.num_rows, 3); parse_edges(single_tree_ex_edges, &t2.edges); CU_ASSERT_EQUAL_FATAL(t2.edges.num_rows, 6); parse_sites(sites_sorted, &t2.sites); parse_mutations(mutations_sorted, &t2.mutations); CU_ASSERT_EQUAL_FATAL(t2.sites.num_rows, 3); CU_ASSERT_EQUAL_FATAL(t2.mutations.num_rows, 9); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); ret = tsk_table_collection_clear(&t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_clear(&t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // now with KEEP_UNREFERENCED parse_nodes(nodes, &t1.nodes); parse_individuals(individuals, &t1.individuals); 
tsk_population_table_add_row(&t1.populations, "A", 1); tsk_population_table_add_row(&t1.populations, "B", 1); tsk_population_table_add_row(&t1.populations, "C", 1); parse_edges(single_tree_ex_edges, &t1.edges); parse_sites(sites, &t1.sites); parse_mutations(mutations, &t1.mutations); ret = tsk_table_collection_canonicalise(&t1, TSK_SUBSET_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); parse_nodes(nodes_sorted, &t2.nodes); tsk_population_table_add_row(&t2.populations, "C", 1); tsk_population_table_add_row(&t2.populations, "A", 1); tsk_population_table_add_row(&t2.populations, "B", 1); parse_individuals(individuals_sorted_kept, &t2.individuals); CU_ASSERT_EQUAL_FATAL(t2.individuals.num_rows, 4); parse_edges(single_tree_ex_edges, &t2.edges); parse_sites(sites_sorted, &t2.sites); parse_mutations(mutations_sorted, &t2.mutations); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t2); tsk_table_collection_free(&t1); } static void test_sort_tables_migrations(void) { int ret; tsk_treeseq_t *ts; tsk_table_collection_t tables, copy; ts = caterpillar_tree(13, 1, 1); ret = tsk_treeseq_copy_tables(ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tables.migrations.num_rows > 0); ret = tsk_table_collection_copy(&tables, ©, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); reverse_migrations(&tables); CU_ASSERT_FATAL(!tsk_table_collection_equals(&tables, ©, 0)); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_migration_table_equals(&tables.migrations, ©.migrations, 0)); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); /* Make sure we test the deeper comparison keys. 
The full key is * (time, source, dest, left, node) */ tsk_migration_table_clear(&tables.migrations); /* params = left, right, node, source, dest, time */ tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 1, 0, NULL, 0); tsk_migration_table_add_row(&tables.migrations, 0, 1, 1, 0, 1, 0, NULL, 0); ret = tsk_migration_table_copy(&tables.migrations, ©.migrations, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); reverse_migrations(&tables); CU_ASSERT_FATAL(!tsk_table_collection_equals(&tables, ©, 0)); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_migration_table_equals(&tables.migrations, ©.migrations, 0)); tsk_table_collection_free(&tables); tsk_table_collection_free(©); tsk_treeseq_free(ts); free(ts); } static void test_sort_tables_individuals(void) { int ret; tsk_table_collection_t tables, copy; const char *individuals = "1 0.25 2,3 0\n" "2 0.5 5,-1 1\n" "3 0.3 -1 2\n" "4 0.3 -1 3\n" "5 0.3 3 4\n" "6 0.3 4 5\n"; const char *individuals_cycle = "1 0.2 2 0\n" "2 0.5 0 1\n" "3 0.3 1 2\n"; const tsk_id_t bad_parents[] = { 200 }; tsk_id_t ret_id; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; parse_individuals(individuals, &tables.individuals); ret = tsk_table_collection_copy(&tables, ©, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Table sort doesn't touch individuals by default*/ ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); /* Not calling with TSK_CHECK_TREES so casting is safe */ ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_INDIVIDUAL_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_INDIVIDUALS); ret = tsk_table_collection_individual_topological_sort(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_INDIVIDUAL_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Check that the 
sort is stable */ tsk_table_collection_free(©); ret = tsk_table_collection_copy(&tables, ©, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_individual_topological_sort(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); /* Errors on bad table */ ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, bad_parents, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 6); ret = tsk_table_collection_individual_topological_sort(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); /* Errors on cycle */ tsk_individual_table_clear(&tables.individuals); parse_individuals(individuals_cycle, &tables.individuals); ret = tsk_table_collection_individual_topological_sort(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_PARENT_CYCLE); tsk_table_collection_free(&tables); tsk_table_collection_free(©); } static void test_sorter_interface(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_table_sorter_t sorter; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, &tables, 0)); /* Nominal case */ reverse_edges(&tables); CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, &tables, 0)); ret = tsk_table_sorter_init(&sorter, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_sorter_run(&sorter, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, &tables, 0)); CU_ASSERT_EQUAL(sorter.user_data, NULL); tsk_table_sorter_free(&sorter); /* If we set the sort_edges function to NULL then we should leave the * node table as is. 
*/ reverse_edges(&tables); CU_ASSERT_FALSE(tsk_edge_table_equals(&ts.tables->edges, &tables.edges, 0)); ret = tsk_table_sorter_init(&sorter, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); sorter.sort_edges = NULL; ret = tsk_table_sorter_run(&sorter, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_edge_table_equals(&ts.tables->edges, &tables.edges, 0)); tsk_table_sorter_free(&sorter); /* Reversing again should make them equal */ reverse_edges(&tables); CU_ASSERT_TRUE(tsk_edge_table_equals(&ts.tables->edges, &tables.edges, 0)); /* Do not check integrity before sorting */ reverse_edges(&tables); CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, &tables, 0)); ret = tsk_table_sorter_init(&sorter, &tables, TSK_NO_CHECK_INTEGRITY); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_sorter_run(&sorter, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, &tables, 0)); tsk_table_sorter_free(&sorter); /* The user_data shouldn't be touched */ reverse_edges(&tables); CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, &tables, 0)); ret = tsk_table_sorter_init(&sorter, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); sorter.user_data = (void *) &ts; ret = tsk_table_sorter_run(&sorter, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, &tables, 0)); CU_ASSERT_EQUAL_FATAL(sorter.user_data, &ts); tsk_table_sorter_free(&sorter); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_dump_unindexed_with_options(tsk_flags_t tc_options) { tsk_table_collection_t tables, loaded; int ret; ret = tsk_table_collection_init(&tables, tc_options); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; parse_nodes(single_tree_ex_nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7); parse_edges(single_tree_ex_edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6); CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)); ret = 
tsk_table_collection_dump(&tables, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)); ret = tsk_table_collection_load(&loaded, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_has_index(&loaded, 0)); CU_ASSERT_TRUE(tsk_node_table_equals(&tables.nodes, &loaded.nodes, 0)); CU_ASSERT_TRUE(tsk_edge_table_equals(&tables.edges, &loaded.edges, 0)); tsk_table_collection_free(&loaded); tsk_table_collection_free(&tables); } static void test_dump_unindexed(void) { test_dump_unindexed_with_options(0); test_dump_unindexed_with_options(TSK_TC_NO_EDGE_METADATA); } static void test_dump_load_empty_with_options(tsk_flags_t tc_options) { int ret; tsk_table_collection_t t1, t2; ret = tsk_table_collection_init(&t1, tc_options); CU_ASSERT_EQUAL_FATAL(ret, 0); t1.sequence_length = 1.0; ret = tsk_table_collection_dump(&t1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t1); tsk_table_collection_free(&t2); } static void test_dump_load_empty(void) { test_dump_load_empty_with_options(0); test_dump_load_empty_with_options(TSK_TC_NO_EDGE_METADATA); } static void test_dump_load_unsorted_with_options(tsk_flags_t tc_options) { int ret; tsk_id_t ret_id; tsk_table_collection_t t1, t2; ret = tsk_table_collection_init(&t1, tc_options); CU_ASSERT_EQUAL_FATAL(ret, 0); t1.sequence_length = 1.0; ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 2); ret_id = tsk_node_table_add_row( &t1.nodes, 
TSK_NODE_IS_SAMPLE, 1, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 3); ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 2, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 4); ret_id = tsk_edge_table_add_row(&t1.edges, 0, 1, 3, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_edge_table_add_row(&t1.edges, 0, 1, 4, 3, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_edge_table_add_row(&t1.edges, 0, 1, 3, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 2); ret_id = tsk_edge_table_add_row(&t1.edges, 0, 1, 4, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 3); /* Verify that it's unsorted */ ret = (int) tsk_table_collection_check_integrity(&t1, TSK_CHECK_EDGE_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME); ret = tsk_table_collection_dump(&t1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_has_index(&t1, 0)); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); CU_ASSERT_FALSE(tsk_table_collection_has_index(&t1, 0)); CU_ASSERT_FALSE(tsk_table_collection_has_index(&t2, 0)); tsk_table_collection_free(&t1); tsk_table_collection_free(&t2); } static void test_dump_load_unsorted(void) { test_dump_load_unsorted_with_options(0); test_dump_load_unsorted_with_options(TSK_TC_NO_EDGE_METADATA); } static void test_dump_load_metadata_schema(void) { int ret; tsk_table_collection_t t1, t2; ret = tsk_table_collection_init(&t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); t1.sequence_length = 1.0; char example[100] = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_length = (tsk_size_t) strlen(example) + 4; tsk_node_table_set_metadata_schema( &t1.nodes, strcat(example, "node"), example_length); tsk_edge_table_set_metadata_schema( &t1.edges, strcat(example, "edge"), example_length); tsk_site_table_set_metadata_schema( &t1.sites, strcat(example, "site"), 
example_length); tsk_mutation_table_set_metadata_schema( &t1.mutations, strcat(example, "muta"), example_length); tsk_migration_table_set_metadata_schema( &t1.migrations, strcat(example, "migr"), example_length); tsk_individual_table_set_metadata_schema( &t1.individuals, strcat(example, "indi"), example_length); tsk_population_table_set_metadata_schema( &t1.populations, strcat(example, "popu"), example_length); ret = tsk_table_collection_dump(&t1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_load(&t2, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t1); tsk_table_collection_free(&t2); } static void test_dump_fail_no_file(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t t1; ret = tsk_table_collection_init(&t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); t1.sequence_length = 1.0; ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 2); ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 1, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 3); ret_id = tsk_node_table_add_row( &t1.nodes, TSK_NODE_IS_SAMPLE, 2, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 4); ret_id = tsk_edge_table_add_row(&t1.edges, 0, 1, 3, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_edge_table_add_row(&t1.edges, 0, 1, 4, 3, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_edge_table_add_row(&t1.edges, 0, 1, 3, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 2); ret_id = tsk_edge_table_add_row(&t1.edges, 0, 1, 4, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 3); /* Verify that it's unsorted */ ret = 
(int) tsk_table_collection_check_integrity(&t1, TSK_CHECK_EDGE_ORDERING);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME);

    /* Make sure the file doesn't exist beforehand. */
    unlink(_tmp_file_name);
    errno = 0;

    /* NOTE(review): no dump call follows the unlink in this test; the only
     * check made here is that access() reports the temporary file as absent. */
    CU_ASSERT_EQUAL(access(_tmp_file_name, F_OK), -1);

    tsk_table_collection_free(&t1);
}

/*
 * Round-trip a single-tree example through _tmp_file_name and exercise the
 * index lifecycle on the loaded tables: drop, rebuild, drop again, dump the
 * unindexed tables, reload them, confirm no index is present and rebuild it.
 */
static void
test_load_reindex(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_treeseq_dump(&ts, _tmp_file_name, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_table_collection_load(&tables, _tmp_file_name, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_table_collection_drop_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0));
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_has_index(&tables, 0));

    ret = tsk_table_collection_drop_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* Dump the unindexed version */
    ret = tsk_table_collection_dump(&tables, _tmp_file_name, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* Free before reusing the same struct as the target of the second load. */
    ret = tsk_table_collection_free(&tables);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_table_collection_load(&tables, _tmp_file_name, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0));
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_has_index(&tables, 0));

    tsk_table_collection_free(&tables);
    tsk_treeseq_free(&ts);
}

/*
 * For each table, set both max_rows and num_rows to TSK_MAX_ID and check that
 * adding one more row returns TSK_ERR_TABLE_OVERFLOW.
 */
static void
test_table_overflow(void)
{
    int ret;
    tsk_id_t ret_id;
    tsk_table_collection_t tables;
    tsk_size_t max_rows = ((tsk_size_t) TSK_MAX_ID);

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Simulate overflows */
    tables.individuals.max_rows = max_rows;
    tables.individuals.num_rows = max_rows;
    ret_id = tsk_individual_table_add_row(&tables.individuals, 0,
0, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLE_OVERFLOW); tables.nodes.max_rows = max_rows; tables.nodes.num_rows = max_rows; ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLE_OVERFLOW); tables.edges.max_rows = max_rows; tables.edges.num_rows = max_rows; ret_id = tsk_edge_table_add_row(&tables.edges, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLE_OVERFLOW); tables.migrations.max_rows = max_rows; tables.migrations.num_rows = max_rows; ret_id = tsk_migration_table_add_row(&tables.migrations, 0, 0, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLE_OVERFLOW); tables.sites.max_rows = max_rows; tables.sites.num_rows = max_rows; ret_id = tsk_site_table_add_row(&tables.sites, 0, 0, 0, 0, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLE_OVERFLOW); tables.mutations.max_rows = max_rows; tables.mutations.num_rows = max_rows; ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, 0, 0, 0, 0, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLE_OVERFLOW); tables.provenances.max_rows = max_rows; tables.provenances.num_rows = max_rows; ret_id = tsk_provenance_table_add_row(&tables.provenances, 0, 0, 0, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLE_OVERFLOW); tables.populations.max_rows = max_rows; tables.populations.num_rows = max_rows; ret_id = tsk_population_table_add_row(&tables.populations, 0, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLE_OVERFLOW); tsk_table_collection_free(&tables); } static void test_column_overflow(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_size_t too_big = TSK_MAX_SIZE; double zero = 0; char zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; tsk_id_t id_zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // location /* We can't trigger a column overflow with one element because the parameter * value is 32 bit */ ret_id = tsk_individual_table_add_row( 
&tables.individuals, 0, &zero, 1, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); // Check normal overflow from additional length ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, too_big, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); // Check overflow from minimum increment ret = tsk_individual_table_set_max_location_length_increment( &tables.individuals, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 1, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); // parents ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, id_zeros, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, too_big, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_individual_table_set_max_parents_length_increment( &tables.individuals, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); // metadata ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 2); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_individual_table_set_max_metadata_length_increment( &tables.individuals, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 1); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, 0, 0, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, 0, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = 
tsk_node_table_set_max_metadata_length_increment(&tables.nodes, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, 0, 0, NULL, 1); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret_id = tsk_edge_table_add_row(&tables.edges, 0, 0, 0, 0, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0, 0, 0, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_edge_table_set_max_metadata_length_increment(&tables.edges, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0, 0, 0, 0, NULL, 1); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret_id = tsk_site_table_add_row(&tables.sites, 0, zeros, 1, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0); // ancestral state ret_id = tsk_site_table_add_row(&tables.sites, 0, NULL, too_big, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_site_table_set_max_ancestral_state_length_increment( &tables.sites, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&tables.sites, 0, NULL, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); // metadata ret_id = tsk_site_table_add_row(&tables.sites, 0, NULL, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_site_table_set_max_metadata_length_increment(&tables.sites, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&tables.sites, 0, NULL, 0, NULL, 1); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, 0, zeros, 1, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0); // derived state ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, 0, 0, NULL, too_big, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_mutation_table_set_max_derived_state_length_increment( &tables.mutations, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id 
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, 0, NULL, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); // metadata ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, 0, 0, NULL, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_mutation_table_set_max_metadata_length_increment( &tables.mutations, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, 0, NULL, 0, NULL, 1); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret_id = tsk_provenance_table_add_row(&tables.provenances, zeros, 1, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0) // timestamp ret_id = tsk_provenance_table_add_row(&tables.provenances, NULL, too_big, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_provenance_table_set_max_timestamp_length_increment( &tables.provenances, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_provenance_table_add_row(&tables.provenances, NULL, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); // record ret_id = tsk_provenance_table_add_row(&tables.provenances, NULL, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_provenance_table_set_max_record_length_increment( &tables.provenances, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_provenance_table_add_row(&tables.provenances, NULL, 0, NULL, 1); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret_id = tsk_population_table_add_row(&tables.populations, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_population_table_set_max_metadata_length_increment( &tables.populations, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 1); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret_id = 
tsk_migration_table_add_row(&tables.migrations, 0, 0, 0, 0, 0, 0, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0, 0, 0, 0, 0, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_migration_table_set_max_metadata_length_increment( &tables.migrations, too_big); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row(&tables.migrations, 0, 0, 0, 0, 0, 0, NULL, 1); CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_COLUMN_OVERFLOW); tsk_table_collection_free(&tables); } static void test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; const char *individuals = "1 0.25 -1\n" "2 0.5,0.25 2\n" "3 0.5,0.25 0\n"; ret = tsk_table_collection_init(&tables, tc_options); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; /* nodes */ ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, INFINITY, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /* Not calling with TSK_CHECK_TREES so casting is safe */ ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); ret = tsk_node_table_clear(&tables.nodes); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, ret_id); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_NO_CHECK_POPULATION_REFS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_node_table_clear(&tables.nodes); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, ret_id); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 
TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); ret = tsk_node_table_clear(&tables.nodes); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 1.0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* edges */ ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, TSK_NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NULL_PARENT); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NULL_CHILD); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 2, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, INFINITY, 1, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = 
tsk_edge_table_add_row(&tables.edges, -1.0, 1.0, 1, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_LEFT_LESS_ZERO); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.1, 1, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_RIGHT_GREATER_SEQ_LENGTH); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.5, 0.1, 1, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_EDGE_INTERVAL); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 0.5, 0, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_TIME_ORDERING); ret = tsk_edge_table_clear(&tables.edges); CU_ASSERT_EQUAL_FATAL(ret, 0); /* sites */ ret_id = tsk_site_table_add_row(&tables.sites, INFINITY, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SITE_POSITION); ret = tsk_site_table_clear(&tables.sites); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&tables.sites, -0.5, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SITE_POSITION); ret = tsk_site_table_clear(&tables.sites); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&tables.sites, 1.5, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SITE_POSITION); 
ret = tsk_site_table_clear(&tables.sites); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.5, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.5, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_SITE_DUPLICATES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SITE_POSITION); ret = tsk_site_table_clear(&tables.sites); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.5, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.4, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_SITE_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_SITES); ret = tsk_site_table_clear(&tables.sites); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.5, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.6, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); /* mutations */ ret_id = tsk_mutation_table_add_row( &tables.mutations, 2, 0, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 2, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* A mixture of known and unknown times on a site fails */ ret = tsk_mutation_table_clear(&tables.mutations); 
CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN); /* But on different sites, passes */ ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 0, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 2, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 0, 1.0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_EQUAL); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 1, 1.0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 1, TSK_NULL, 1.0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) 
tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 1, 0, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_MUTATIONS); /* Unknown times pass */ ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, 
TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Correctly ordered times pass */ ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, 1, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, 1, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Incorrectly ordered times fail */ ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, 1, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_MUTATIONS); /* Putting incorrectly ordered times on diff sites passes */ ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, 1, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 0, TSK_NULL, 2, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 0, TSK_NULL, 1, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id 
>= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, NAN, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, INFINITY, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 1, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 1, TSK_NULL, 1, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row(&tables.mutations, 1, 1, 0, 2, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, 0); /* migrations */ ret_id = tsk_population_table_add_row(&tables.populations, 
NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.0, 0.5, 2, 0, 1, 1.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.0, 0.5, 1, 2, 1, 1.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.0, 0.5, 1, 0, 2, 1.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.0, 0.5, 1, 0, 1, INFINITY, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.0, 0.5, 1, 0, 1, 1.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.0, 0.5, 1, 1, 0, 0.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_MIGRATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, 
TSK_ERR_UNSORTED_MIGRATIONS); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.0, INFINITY, 1, 0, 1, 1.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, -0.3, 0.5, 1, 0, 1, 1.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_LEFT_LESS_ZERO); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.0, 1.5, 1, 0, 1, 1.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_RIGHT_GREATER_SEQ_LENGTH); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.6, 0.5, 1, 0, 1, 1.5, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_EDGE_INTERVAL); ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); parse_individuals(individuals, &tables.individuals); CU_ASSERT_EQUAL_FATAL(tables.individuals.num_rows, 3); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_INDIVIDUAL_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_INDIVIDUALS); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Check that an individual can't be its own parent */ tables.individuals.parents[0] = 0; tables.individuals.parents[1] = 1; tables.individuals.parents[2] = 2; ret = (int) tsk_table_collection_check_integrity(&tables, 0); 
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_SELF_PARENT); tables.individuals.parents[0] = -2; ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_table_collection_free(&tables); } static void test_table_collection_check_integrity_no_populations(void) { int ret; tsk_id_t ret_id; tsk_id_t ret_num_trees; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add in some bad population references and check that we can use * TSK_NO_CHECK_POPULATION_REFS with TSK_CHECK_TREES */ tables.nodes.population[0] = 10; /* Not calling with TSK_CHECK_TREES so casting is safe */ ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret_num_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); CU_ASSERT_EQUAL_FATAL(ret_num_trees, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_NO_CHECK_POPULATION_REFS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_num_trees = tsk_table_collection_check_integrity( &tables, TSK_CHECK_TREES | TSK_NO_CHECK_POPULATION_REFS); /* CHECK_TREES returns the number of trees */ CU_ASSERT_EQUAL_FATAL(ret_num_trees, 3); tables.nodes.population[0] = TSK_NULL; ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_migration_table_add_row( &tables.migrations, 0.4, 0.5, 1, 0, 1, 1.5, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret_num_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); CU_ASSERT_EQUAL_FATAL(ret_num_trees, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = 
(int) tsk_table_collection_check_integrity(
        &tables, TSK_NO_CHECK_POPULATION_REFS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret_num_trees = tsk_table_collection_check_integrity(
        &tables, TSK_CHECK_TREES | TSK_NO_CHECK_POPULATION_REFS);
    CU_ASSERT_EQUAL_FATAL(ret_num_trees, 3);

    tsk_table_collection_free(&tables);
    tsk_treeseq_free(&ts);
}

/* Run the check_integrity scenarios twice: once with option flags 0 and once
 * with TSK_TC_NO_EDGE_METADATA passed through to the helper. */
static void
test_table_collection_check_integrity(void)
{
    test_table_collection_check_integrity_with_options(0);
    test_table_collection_check_integrity_with_options(TSK_TC_NO_EDGE_METADATA);
}

/* Build a small indexed table collection, then replace its edges without
 * rebuilding the index, and verify that check_integrity with TSK_CHECK_TREES
 * reports the stale index (TSK_ERR_TABLES_NOT_INDEXED when the index length
 * no longer matches the edge table, TSK_ERR_TABLES_BAD_INDEXES when the
 * lengths agree but the contents do not describe the new edges). */
static void
test_table_collection_check_integrity_bad_indexes_example(void)
{
    int ret;
    tsk_id_t ret_id;
    tsk_table_collection_t tables;

    /* We start with a concrete example where you can get bad trees
     * by building some valid tables, clearing the edges, and then
     * building new ones without rebuilding the indexes. */
    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 5;

    /* nodes */
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 1);
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 1.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 2);

    /* edges */
    ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 5.0, 2, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 5.0, 2, 1, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 1);

    /* build index */
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* okay now build a new table without rebuilding the indexes */
    tsk_edge_table_clear(&tables.edges);
    ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);

    /* make sure we don't use too-long indexes: the index was built for two
     * edges but the table now holds one */
    ret_id = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLES_NOT_INDEXED);

    /* back to two edges, so the index length matches again */
    ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 4.0, 2, 1, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 1);

    /* should error, as tree sequence will be wrong */
    ret_id = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLES_BAD_INDEXES);

    tsk_table_collection_free(&tables);
}

/* Overwrite entries of tables.indexes.edge_insertion_order and
 * tables.indexes.edge_removal_order by hand (rebuilding the index between
 * scenarios) and verify that check_integrity with TSK_CHECK_TREES rejects each
 * corrupted index. */
static void
test_table_collection_check_integrity_bad_indexes(void)
{
    int ret;
    tsk_id_t ret_id;
    tsk_table_collection_t tables;

    /* Now hit some other weird cases by manipulating the indexes directly */
    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 5;

    /* nodes */
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 1);
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 1.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 2);

    /* edges */
    ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    ret_id = tsk_edge_table_add_row(&tables.edges, 1.0, 2.0, 2, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 1);
    ret_id = tsk_edge_table_add_row(&tables.edges, 2.0, 5.0, 2, 0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 2);
    ret_id = tsk_edge_table_add_row(&tables.edges, 1.0, 3.0, 2, 1, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 3);

    /* build index */
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* baseline: with a freshly built index the check succeeds and returns a
     * positive value */
    ret_id = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT(ret_id > 0);

    /* edge removed before it is added */
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.indexes.edge_insertion_order[0] = 1;
    tables.indexes.edge_insertion_order[2] = 0;
    ret_id =
tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLES_BAD_INDEXES);

    /* edge added twice (implies another is never added) */
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.indexes.edge_insertion_order[0] = 0;
    tables.indexes.edge_insertion_order[1] = 0;
    tables.indexes.edge_removal_order[0] = 1;
    tables.indexes.edge_removal_order[2] = 2;
    ret_id = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLES_BAD_INDEXES);

    /* edge never removed but should have been */
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.indexes.edge_removal_order[0] = 0;
    tables.indexes.edge_removal_order[1] = 1;
    tables.indexes.edge_removal_order[2] = 2;
    tables.indexes.edge_removal_order[3] = 3;
    ret_id = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLES_BAD_INDEXES);

    /* edge progression out of order (the index is not rebuilt here; only the
     * right coordinate of edge 2 is changed) */
    tables.edges.right[2] = 4.0;
    ret_id = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLES_BAD_INDEXES);

    /* edge never used */
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.indexes.edge_insertion_order[0] = 0;
    tables.indexes.edge_insertion_order[1] = 3;
    tables.indexes.edge_insertion_order[2] = 0;
    tables.indexes.edge_insertion_order[3] = 3;
    tables.indexes.edge_removal_order[0] = 0;
    tables.indexes.edge_removal_order[1] = 3;
    tables.indexes.edge_removal_order[2] = 0;
    tables.indexes.edge_removal_order[3] = 3;
    ret_id = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLES_BAD_INDEXES);

    /* make sure we don't use the too-short indexes: a fifth edge is added
     * while the index still describes four */
    ret_id = tsk_edge_table_add_row(&tables.edges, 4.0, 5.0, 2, 1, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 4);
    ret_id = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_id, TSK_ERR_TABLES_NOT_INDEXED);

    tsk_table_collection_free(&tables);
}

/* Verify that check_integrity with TSK_CHECK_MUTATION_PARENTS rejects mutation
 * parents that are inconsistent with the tree, using the single-tree example
 * tables (7 nodes, 6 edges) and two mutations at one site. */
static void
test_check_integrity_bad_mutation_parent_topology(void)
{
    int ret;
    tsk_id_t ret_trees;
    tsk_table_collection_t tables;
    const char *sites = "0 0\n";
    /* The parent mutation sits on a parallel branch rather than above the
     * child */
    const char *bad_mutations = "0 0 1 -1\n"
                                "0 1 1 0\n";
    /* A mutation higher up the tree is listed as the child */
    const char *reverse_mutations = "0 0 1 -1\n"
                                    "0 4 1 0\n";
    /* Two sites listed in decreasing position order */
    const char *reverse_sites = "0.5 0\n"
                                "0 0\n";

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1;

    parse_nodes(single_tree_ex_nodes, &tables.nodes);
    CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7);
    parse_edges(single_tree_ex_edges, &tables.edges);
    CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6);
    parse_sites(sites, &tables.sites);
    CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 1);
    parse_mutations(bad_mutations, &tables.mutations);
    CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 2);
    /* NOTE(review): sequence_length was already set to 1 above; this second
     * assignment looks redundant */
    tables.sequence_length = 1.0;

    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* The trees themselves are fine (one tree); only the mutation-parent
     * check fails */
    ret_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_trees, 1);
    ret_trees
        = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS);
    CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_BAD_MUTATION_PARENT);

    /* presumably parse_mutations replaces the rows loaded above; verify
     * against the helper in testlib */
    parse_mutations(reverse_mutations, &tables.mutations);
    ret_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    CU_ASSERT_EQUAL_FATAL(ret_trees, 1);
    ret_trees
        = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS);
    CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_MUTATION_PARENT_AFTER_CHILD);

    /* Now check that TSK_CHECK_MUTATION_PARENTS implies TSK_CHECK_TREES
       by triggering an error with reversed sites */
    parse_sites(reverse_sites, &tables.sites);
    ret_trees
        = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_PARENTS);
    CU_ASSERT_EQUAL_FATAL(ret_trees, TSK_ERR_UNSORTED_SITES);

    tsk_table_collection_free(&tables);
}

/* Verify that compute_mutation_parents succeeds when the existing parent
 * column holds an invalid value (42 with a single mutation row), and that the
 * value is replaced by TSK_NULL. */
static void
test_table_collection_compute_mutation_parents_tolerates_invalid_input(void)
{
    int ret;
    tsk_id_t ret_id;
    tsk_table_collection_t tables;
    tsk_id_t site;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1.0;

    /* node 0 (time 1.0) is the parent of sample node 1 (time 0.0) */
    ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);

    ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 0, 1, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);

    site = tsk_site_table_add_row(&tables.sites, 0.0, "A", 1, NULL, 0);
    CU_ASSERT_FATAL(site >= 0);
    ret_id = tsk_mutation_table_add_row(
        &tables.mutations, site, 1, TSK_NULL, TSK_UNKNOWN_TIME, "C", 1, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);

    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* plant a parent id that cannot exist in a one-row mutation table */
    tables.mutations.parent[0] = 42;

    ret = tsk_table_collection_compute_mutation_parents(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_FATAL(tables.mutations.parent[0] == TSK_NULL);

    tsk_table_collection_free(&tables);
}

/* Verify that when compute_mutation_parents fails, the mutation parent column
 * still holds the values it had before the call. */
static void
test_table_collection_compute_mutation_parents_restores_on_error(void)
{
    int ret;
    tsk_id_t ret_id;
    tsk_table_collection_t tables;
    tsk_id_t site;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1.0;

    ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);

    ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 0, 1, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);

    site = tsk_site_table_add_row(&tables.sites, 0.5, "A", 1, NULL, 0);
    CU_ASSERT_FATAL(site >= 0);
    ret_id = tsk_mutation_table_add_row(
&tables.mutations, site, 1, TSK_NULL, TSK_UNKNOWN_TIME, "C", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, site, 0, TSK_NULL, TSK_UNKNOWN_TIME, "G", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_build_index(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.mutations.parent[0] = 111; tables.mutations.parent[1] = 222; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD); CU_ASSERT_EQUAL(tables.mutations.parent[0], 111); CU_ASSERT_EQUAL(tables.mutations.parent[1], 222); tsk_table_collection_free(&tables); } static void test_table_collection_subset_with_options(tsk_flags_t options) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_table_collection_t tables_copy; int k; tsk_id_t nodes[4]; tsk_id_t zero_p[] = { 0 }; tsk_id_t one_p[] = { 1 }; ret = tsk_table_collection_init(&tables, options); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_table_collection_init(&tables_copy, options); CU_ASSERT_EQUAL_FATAL(ret, 0); // does not error on empty tables ret = tsk_table_collection_subset(&tables, NULL, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // four nodes from two diploids; the first is from pop 0 ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 1.0, 0, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 2.0, TSK_NULL, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); // unused individual who is the parent of others ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, zero_p, 1, 
NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, one_p, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); // unused individual ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, one_p, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); // unused population ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.4, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); // unused site ret_id = tsk_site_table_add_row(&tables.sites, 0.5, "C", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, 0, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); // empty nodes should get empty tables ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset(&tables_copy, NULL, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); // unless 
NO_CHANGE_POPULATIONS is provided ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset( &tables_copy, NULL, 0, TSK_SUBSET_NO_CHANGE_POPULATIONS); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); CU_ASSERT_FATAL( tsk_population_table_equals(&tables.populations, &tables_copy.populations, 0)); // or KEEP_UNREFERENCED ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset( &tables_copy, NULL, 0, TSK_SUBSET_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); CU_ASSERT_FATAL( tsk_individual_table_equals(&tables.individuals, &tables_copy.individuals, 0)); CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 2); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); CU_ASSERT_FATAL(tsk_site_table_equals(&tables.sites, &tables_copy.sites, 0)); // or both ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset(&tables_copy, NULL, 0, TSK_SUBSET_KEEP_UNREFERENCED | TSK_SUBSET_NO_CHANGE_POPULATIONS); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); CU_ASSERT_FATAL( tsk_individual_table_equals(&tables.individuals, &tables_copy.individuals, 0)); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); CU_ASSERT_FATAL( tsk_population_table_equals(&tables.populations, &tables_copy.populations, 0)); CU_ASSERT_FATAL(tsk_site_table_equals(&tables.sites, &tables_copy.sites, 0)); // the identity transformation, since unused pops are at the end for (k = 0; k < 4; k++) { nodes[k] = k; } ret = tsk_table_collection_copy(&tables, 
&tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset( &tables_copy, nodes, 4, TSK_SUBSET_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); // or, remove unused things: ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables_copy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_node_table_equals(&tables.nodes, &tables_copy.nodes, 0)); CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 2); CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 1); CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 2); CU_ASSERT_FATAL( tsk_mutation_table_equals(&tables.mutations, &tables_copy.mutations, 0)); // reverse twice should get back to the start, since unused pops are at the end for (k = 0; k < 4; k++) { nodes[k] = 3 - k; } ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset( &tables_copy, nodes, 4, TSK_SUBSET_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset( &tables_copy, nodes, 4, TSK_SUBSET_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables_copy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); tsk_table_collection_free(&tables_copy); tsk_table_collection_free(&tables); } static void test_table_collection_subset(void) { test_table_collection_subset_with_options(0); test_table_collection_subset_with_options(TSK_TC_NO_EDGE_METADATA); } static void test_table_collection_subset_unsorted(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_table_collection_t tables_copy; int k; tsk_id_t 
nodes[3]; tsk_id_t one_p[] = { 1 }; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_table_collection_init(&tables_copy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // these tables are a big mess ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 0.5, TSK_NULL, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.0, TSK_NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, one_p, 1, NULL, 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 0.5, 2, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.5, 1.0, 2, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.4, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, 2, TSK_UNKNOWN_TIME, "B", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); // but still, this should leave them unchanged for (k = 0; k < 3; k++) { nodes[k] = k; } ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset( &tables_copy, nodes, 3, TSK_SUBSET_KEEP_UNREFERENCED); 
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); tsk_table_collection_free(&tables_copy); tsk_table_collection_free(&tables); } static void test_table_collection_subset_errors(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_table_collection_t tables_copy; tsk_id_t nodes[4] = { 0, 1, 2, 3 }; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_table_collection_init(&tables_copy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // four nodes from two diploids; the first is from pop 0 ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 1.0, 0, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 2.0, TSK_NULL, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row( &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_build_index(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Migrations are not supported */ ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_migration_table_add_row(&tables_copy.migrations, 0, 1, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.migrations.num_rows, 1); ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, 
TSK_ERR_MIGRATIONS_NOT_SUPPORTED); // test out of bounds nodes ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); nodes[0] = -1; ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); nodes[0] = 6; ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); // check integrity nodes[0] = 0; nodes[1] = 1; ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_truncate(&tables_copy.nodes, 3); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row( &tables_copy.nodes, TSK_NODE_IS_SAMPLE, 0.0, -2, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); tsk_table_collection_free(&tables); tsk_table_collection_free(&tables_copy); } static void test_table_collection_union(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_table_collection_t tables_empty; tsk_table_collection_t tables_copy; tsk_id_t node_mapping[3]; tsk_id_t parents[2] = { -1, -1 }; char example_metadata[100] = "An example of metadata with unicode 🎄🌳🌴🌲🎋"; tsk_size_t example_metadata_length = (tsk_size_t) strlen(example_metadata); tsk_memset(node_mapping, 0xff, sizeof(node_mapping)); ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_table_collection_init(&tables_empty, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables_empty.sequence_length = 1; ret = tsk_table_collection_init(&tables_copy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // does not error on empty tables ret = tsk_table_collection_union(&tables, &tables_empty, node_mapping, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // does not error on empty 
tables but that differ on top level metadata ret = tsk_table_collection_set_metadata( &tables, example_metadata, example_metadata_length); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_union(&tables, &tables_empty, node_mapping, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // three nodes, two pop, three ind, two edge, two site, two mut ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 1, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.5, 1, 2, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, parents, 2, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); parents[0] = 0; ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, parents, 2, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); parents[1] = 1; ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, parents, 2, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.4, "T", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_build_index(&tables, 0); 
CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // union with empty should not change // other is empty ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_union( &tables_copy, &tables_empty, node_mapping, TSK_UNION_NO_CHECK_SHARED); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); // self is empty ret = tsk_table_collection_clear(&tables_copy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_union( &tables_copy, &tables, node_mapping, TSK_UNION_NO_CHECK_SHARED); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); // union all shared nodes + subset original nodes = original table ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_union( &tables_copy, &tables, node_mapping, TSK_UNION_NO_CHECK_SHARED); CU_ASSERT_EQUAL_FATAL(ret, 0); node_mapping[0] = 0; node_mapping[1] = 1; node_mapping[2] = 2; ret = tsk_table_collection_subset(&tables_copy, node_mapping, 3, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); // union with one shared node ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); node_mapping[0] = TSK_NULL; node_mapping[1] = TSK_NULL; node_mapping[2] = 2; ret = tsk_table_collection_union(&tables_copy, &tables, node_mapping, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL( tables_copy.populations.num_rows, tables.populations.num_rows + 2); CU_ASSERT_EQUAL_FATAL( tables_copy.individuals.num_rows, tables.individuals.num_rows + 2); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, tables.nodes.num_rows + 2); CU_ASSERT_EQUAL_FATAL(tables_copy.edges.num_rows, tables.edges.num_rows + 2); CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 
tables.sites.num_rows); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, tables.mutations.num_rows + 2); // union with one shared node, but no add pop ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); node_mapping[0] = TSK_NULL; node_mapping[1] = TSK_NULL; node_mapping[2] = 2; ret = tsk_table_collection_union( &tables_copy, &tables, node_mapping, TSK_UNION_NO_ADD_POP); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, tables.populations.num_rows); CU_ASSERT_EQUAL_FATAL( tables_copy.individuals.num_rows, tables.individuals.num_rows + 2); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, tables.nodes.num_rows + 2); CU_ASSERT_EQUAL_FATAL(tables_copy.edges.num_rows, tables.edges.num_rows + 2); CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, tables.sites.num_rows); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, tables.mutations.num_rows + 2); tsk_table_collection_free(&tables_copy); tsk_table_collection_free(&tables_empty); tsk_table_collection_free(&tables); } static void test_table_collection_disjoint_union(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_table_collection_t tables1; tsk_table_collection_t tables2; tsk_table_collection_t tables12; tsk_id_t node_mapping[4]; tsk_memset(node_mapping, 0xff, sizeof(node_mapping)); ret = tsk_table_collection_init(&tables1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables1.sequence_length = 2; // set up nodes, which will be shared // flags, time, pop, ind, metadata, metadata_length ret_id = tsk_node_table_add_row( &tables1.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row( &tables1.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables1.nodes, 0, 0.5, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables1.nodes, 0, 1.5, 
TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);

    /* tables2 starts as a copy holding only the shared nodes */
    ret = tsk_table_collection_copy(&tables1, &tables2, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    // for tables1:
    // on [0, 1] we have 0, 1 inherit from 2
    // left, right, parent, child, metadata, metadata_length
    ret_id = tsk_edge_table_add_row(&tables1.edges, 0.0, 1.0, 2, 0, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_edge_table_add_row(&tables1.edges, 0.0, 1.0, 2, 1, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_site_table_add_row(&tables1.sites, 0.4, "T", 1, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    /* ret_id still holds the id of the site just added and is passed as the
     * mutation's site */
    ret_id = tsk_mutation_table_add_row(
        &tables1.mutations, ret_id, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret = tsk_table_collection_build_index(&tables1, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_table_collection_sort(&tables1, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    // all this goes in tables12 so far
    ret = tsk_table_collection_copy(&tables1, &tables12, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    // for tables2; and need to add to tables12 also:
    // on [1, 2] we have 0, 1 inherit from 3
    // left, right, parent, child, metadata, metadata_length
    ret_id = tsk_edge_table_add_row(&tables2.edges, 1.0, 2.0, 3, 0, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_edge_table_add_row(&tables2.edges, 1.0, 2.0, 3, 1, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_site_table_add_row(&tables2.sites, 1.4, "A", 1, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    /* as above, ret_id is the id of the site just added */
    ret_id = tsk_mutation_table_add_row(
        &tables2.mutations, ret_id, 1, TSK_NULL, TSK_UNKNOWN_TIME, "T", 1, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret = tsk_table_collection_build_index(&tables2, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_table_collection_sort(&tables2, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    // also tables12
    ret_id = tsk_edge_table_add_row(&tables12.edges, 1.0, 2.0, 3, 0, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_edge_table_add_row(&tables12.edges, 1.0, 2.0, 3, 1, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_site_table_add_row(&tables12.sites, 1.4, "A", 1, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret_id = tsk_mutation_table_add_row(
        &tables12.mutations, ret_id, 1, TSK_NULL, TSK_UNKNOWN_TIME, "T", 1, NULL, 0);
    CU_ASSERT_FATAL(ret_id >= 0);
    ret = tsk_table_collection_build_index(&tables12, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_table_collection_sort(&tables12, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    // now disjoint union-ing tables1 and tables2 should get tables12
    ret = tsk_table_collection_copy(&tables1, &tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* identity mapping: every node of tables2 is shared with tables */
    node_mapping[0] = 0;
    node_mapping[1] = 1;
    node_mapping[2] = 2;
    node_mapping[3] = 3;
    ret = tsk_table_collection_union(&tables, &tables2, node_mapping,
        TSK_UNION_NO_CHECK_SHARED | TSK_UNION_ALL_EDGES | TSK_UNION_ALL_MUTATIONS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* provenance is excluded from the comparison */
    CU_ASSERT_FATAL(
        tsk_table_collection_equals(&tables, &tables12, TSK_CMP_IGNORE_PROVENANCE));

    tsk_table_collection_free(&tables12);
    tsk_table_collection_free(&tables2);
    tsk_table_collection_free(&tables1);
    tsk_table_collection_free(&tables);
}

static void
test_table_collection_union_middle_merge(void)
{
    /* Test ability to have non-shared history both above and below the
     * shared bits.
The full genealogy, in `tu`, is: * 3 4 * \ / * 2 * / \ * 0 1 * and the left lineage is in `ta` and right in `tb` */ int ret; tsk_id_t ret_id; tsk_id_t node_mapping[] = { TSK_NULL, 1, TSK_NULL }; tsk_id_t node_order[] = { 0, 3, 1, 2, 4 }; tsk_table_collection_t ta, tb, tu; ret = tsk_table_collection_init(&ta, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ta.sequence_length = 1; ret = tsk_table_collection_init(&tb, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tb.sequence_length = 1; ret = tsk_table_collection_init(&tu, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tu.sequence_length = 1; ret_id = tsk_node_table_add_row( &tu.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); // node u0 CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &ta.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); // node a0 = u0 CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &tu.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); // node u1 CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &tb.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); // node b0 = u1 CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &tu.nodes, 0, 1, TSK_NULL, TSK_NULL, NULL, 0); // node u2 CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tu.edges, 0, 1, 2, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tu.edges, 0, 1, 2, 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &ta.nodes, 0, 1, TSK_NULL, TSK_NULL, NULL, 0); // node a1 = u2 CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&ta.edges, 0, 1, 1, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &tb.nodes, 0, 1, TSK_NULL, TSK_NULL, NULL, 0); // node b1 = u2 CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tb.edges, 0, 1, 1, 0, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &tu.nodes, 0, 2, TSK_NULL, TSK_NULL, NULL, 0); // node u3 CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tu.edges, 0, 0.5, 3, 2, NULL, 0); 
CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &ta.nodes, 0, 2, TSK_NULL, TSK_NULL, NULL, 0); // node a2 = u3 CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&ta.edges, 0, 0.5, 2, 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &tu.nodes, 0, 2, TSK_NULL, TSK_NULL, NULL, 0); // node u4 CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tu.edges, 0.5, 1, 4, 2, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_node_table_add_row( &tb.nodes, 0, 2, TSK_NULL, TSK_NULL, NULL, 0); // node b2 = u4 CU_ASSERT(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tb.edges, 0.5, 1, 2, 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_site_table_add_row(&ta.sites, 0.25, "A", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_site_table_add_row(&ta.sites, 0.75, "X", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_site_table_add_row(&tb.sites, 0.25, "A", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_site_table_add_row(&tb.sites, 0.75, "X", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_site_table_add_row(&tu.sites, 0.25, "A", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_site_table_add_row(&tu.sites, 0.75, "X", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tu.mutations, 0, 3, TSK_NULL, 3.5, "B", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &ta.mutations, 0, 2, TSK_NULL, 3.5, "B", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tu.mutations, 0, 2, TSK_NULL, 1.5, "D", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &ta.mutations, 0, 1, TSK_NULL, 1.5, "D", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tb.mutations, 0, 1, TSK_NULL, 1.5, "D", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tu.mutations, 0, 2, TSK_NULL, 1.2, "E", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &ta.mutations, 0, 1, TSK_NULL, 1.2, "E", 1, NULL, 0); CU_ASSERT(ret_id >= 
0); ret_id = tsk_mutation_table_add_row( &tb.mutations, 0, 1, TSK_NULL, 1.2, "E", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tu.mutations, 0, 0, TSK_NULL, 0.5, "C", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &ta.mutations, 0, 0, TSK_NULL, 0.5, "C", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tu.mutations, 1, 4, TSK_NULL, 2.4, "Y", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tb.mutations, 1, 2, TSK_NULL, 2.4, "Y", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tu.mutations, 1, 1, TSK_NULL, 0.4, "Z", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tb.mutations, 1, 0, TSK_NULL, 0.4, "Z", 1, NULL, 0); CU_ASSERT(ret_id >= 0); ret = tsk_table_collection_build_index(&ta, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_compute_mutation_parents(&ta, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_build_index(&tb, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_compute_mutation_parents(&tb, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_build_index(&tu, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_compute_mutation_parents(&tu, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_union(&ta, &tb, node_mapping, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_table_collection_subset(&ta, node_order, 5, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_provenance_table_clear(&ta.provenances); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tu, &ta, 0)); tsk_table_collection_free(&ta); tsk_table_collection_free(&tb); tsk_table_collection_free(&tu); } static void test_table_collection_union_errors(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_table_collection_t tables_copy; tsk_id_t node_mapping[] = { 0, 1 }; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = 
tsk_table_collection_init(&tables_copy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // two nodes, two pop, two ind, one edge, one site, one mut ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.5, 1, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); // trigger diff histories error ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_mutation_table_add_row( &tables_copy.mutations, 0, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_union(&tables_copy, &tables, node_mapping, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNION_DIFF_HISTORIES); // Migrations are not supported ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_migration_table_add_row(&tables_copy.migrations, 0, 1, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.migrations.num_rows, 1); ret = tsk_table_collection_union( &tables_copy, &tables, node_mapping, TSK_UNION_NO_CHECK_SHARED); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED); // 
test out of bounds node_mapping node_mapping[0] = -4; node_mapping[1] = 6; ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_union(&tables_copy, &tables, node_mapping, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNION_BAD_MAP); // check integrity node_mapping[0] = 0; node_mapping[1] = 1; ret_id = tsk_node_table_add_row( &tables_copy.nodes, TSK_NODE_IS_SAMPLE, 0.0, -2, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_union(&tables_copy, &tables, node_mapping, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, -2, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_union(&tables, &tables_copy, node_mapping, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); tsk_table_collection_free(&tables_copy); tsk_table_collection_free(&tables); } static void test_table_collection_clear_with_options(tsk_flags_t options) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; bool clear_provenance = !!(options & TSK_CLEAR_PROVENANCE); bool clear_metadata_schemas = !!(options & TSK_CLEAR_METADATA_SCHEMAS); bool clear_ts_metadata = !!(options & TSK_CLEAR_TS_METADATA_AND_SCHEMA); tsk_bookmark_t num_rows; tsk_bookmark_t expected_rows = { .provenances = clear_provenance ? 0 : 1 }; tsk_size_t expected_len = clear_metadata_schemas ? 0 : 4; tsk_size_t expected_len_ts = clear_ts_metadata ? 
0 : 4; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.5, 1, 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 0, 0, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_build_index(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_individual_table_set_metadata_schema(&tables.individuals, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_set_metadata_schema(&tables.nodes, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_edge_table_set_metadata_schema(&tables.edges, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_set_metadata_schema(&tables.migrations, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_set_metadata_schema(&tables.sites, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_set_metadata_schema(&tables.mutations, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = 
tsk_population_table_set_metadata_schema(&tables.populations, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_time_units(&tables, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_metadata(&tables, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_metadata_schema(&tables, "test", 4); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_provenance_table_add_row(&tables.provenances, "today", 5, "test", 4); CU_ASSERT_FATAL(ret_id >= 0); ret = tsk_table_collection_clear(&tables, options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_record_num_rows(&tables, &num_rows); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(num_rows.individuals, expected_rows.individuals); CU_ASSERT_EQUAL(num_rows.nodes, expected_rows.nodes); CU_ASSERT_EQUAL(num_rows.edges, expected_rows.edges); CU_ASSERT_EQUAL(num_rows.migrations, expected_rows.migrations); CU_ASSERT_EQUAL(num_rows.sites, expected_rows.sites); CU_ASSERT_EQUAL(num_rows.mutations, expected_rows.mutations); CU_ASSERT_EQUAL(num_rows.populations, expected_rows.populations); CU_ASSERT_EQUAL(num_rows.provenances, expected_rows.provenances); CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)); CU_ASSERT_EQUAL(tables.individuals.metadata_schema_length, expected_len); CU_ASSERT_EQUAL(tables.nodes.metadata_schema_length, expected_len); CU_ASSERT_EQUAL(tables.edges.metadata_schema_length, expected_len); CU_ASSERT_EQUAL(tables.migrations.metadata_schema_length, expected_len); CU_ASSERT_EQUAL(tables.sites.metadata_schema_length, expected_len); CU_ASSERT_EQUAL(tables.mutations.metadata_schema_length, expected_len); CU_ASSERT_EQUAL(tables.populations.metadata_schema_length, expected_len); CU_ASSERT_EQUAL(tables.metadata_schema_length, expected_len_ts); CU_ASSERT_EQUAL(tables.metadata_length, expected_len_ts); CU_ASSERT_EQUAL(tables.time_units_length, 4); tsk_table_collection_free(&tables); } static void test_table_collection_clear(void) { 
test_table_collection_clear_with_options(0); test_table_collection_clear_with_options(TSK_CLEAR_PROVENANCE); test_table_collection_clear_with_options(TSK_CLEAR_METADATA_SCHEMAS); test_table_collection_clear_with_options(TSK_CLEAR_TS_METADATA_AND_SCHEMA); test_table_collection_clear_with_options( TSK_CLEAR_PROVENANCE | TSK_CLEAR_METADATA_SCHEMAS); test_table_collection_clear_with_options( TSK_CLEAR_PROVENANCE | TSK_CLEAR_TS_METADATA_AND_SCHEMA); test_table_collection_clear_with_options( TSK_CLEAR_METADATA_SCHEMAS | TSK_CLEAR_TS_METADATA_AND_SCHEMA); test_table_collection_clear_with_options(TSK_CLEAR_PROVENANCE | TSK_CLEAR_METADATA_SCHEMAS | TSK_CLEAR_TS_METADATA_AND_SCHEMA); } static void test_table_collection_takeset_indexes(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t t1, t2; tsk_id_t *ins; tsk_id_t *rem; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ins = tsk_malloc(t1.edges.num_rows * sizeof(*ins)); CU_ASSERT_FATAL(ins != NULL); rem = tsk_malloc(t1.edges.num_rows * sizeof(*rem)); CU_ASSERT_FATAL(rem != NULL); memcpy(ins, t1.indexes.edge_insertion_order, (size_t) (t1.edges.num_rows * sizeof(*ins))); memcpy( rem, t1.indexes.edge_removal_order, (size_t) (t1.edges.num_rows * sizeof(*rem))); ret = tsk_table_collection_copy(&t1, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_drop_index(&t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_takeset_indexes(&t2, ins, rem); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL( tsk_memcmp(t1.indexes.edge_insertion_order, t2.indexes.edge_insertion_order, t1.edges.num_rows * sizeof(*ins)), 0); CU_ASSERT_EQUAL(tsk_memcmp(t1.indexes.edge_removal_order, t2.indexes.edge_removal_order, t1.edges.num_rows * sizeof(*rem)), 0); ret = tsk_table_collection_takeset_indexes(&t2, ins, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = 
tsk_table_collection_takeset_indexes(&t2, NULL, rem); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_table_collection_free(&t1); tsk_table_collection_free(&t2); tsk_treeseq_free(&ts); } static void test_table_collection_delete_older(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t t; const char *mutations = "0 2 1 -1\n" "0 2 0 0\n" "1 0 1 -1\n" "2 5 1 -1\n"; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &t, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); /* Add some migrations */ tsk_population_table_add_row(&t.populations, NULL, 0); tsk_population_table_add_row(&t.populations, NULL, 0); tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 0, 1, 0.05, NULL, 0); tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 1, 0, 0.09, NULL, 0); tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 0, 1, 0.10, NULL, 0); CU_ASSERT_EQUAL(t.migrations.num_rows, 3); /* Note: trees 1 and 2 are identical now * 0.09┊ 5 ┊ 5 ┊ 5 ┊ ┊ ┏┻┓ ┊ ┏━┻┓ ┊ ┏━┻┓ ┊ 0.07┊ ┃ ┃ ┊ ┃ 4 ┊ ┃ 4 ┊ ┊ ┃ ┃ ┊ ┃ ┏┻┓ ┊ ┃ ┏┻┓ ┊ 0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊ 0.00 2.00 7.00 10.00 */ ret = tsk_table_collection_delete_older(&t, 0.09, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &t, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 9); /* Lost the mutation over 5 */ CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 3); /* We delete the migration at exactly 0.09. 
*/ CU_ASSERT_EQUAL(tsk_treeseq_get_num_migrations(&ts), 1); tsk_table_collection_free(&t); tsk_treeseq_free(&ts); } int main(int argc, char **argv) { CU_TestInfo tests[] = { { "test_node_table", test_node_table }, { "test_node_table_update_row", test_node_table_update_row }, { "test_node_table_keep_rows", test_node_table_keep_rows }, { "test_node_table_takeset", test_node_table_takeset }, { "test_edge_table", test_edge_table }, { "test_edge_table_update_row", test_edge_table_update_row }, { "test_edge_table_update_row_no_metadata", test_edge_table_update_row_no_metadata }, { "test_edge_table_keep_rows", test_edge_table_keep_rows }, { "test_edge_table_keep_rows_no_metadata", test_edge_table_keep_rows_no_metadata }, { "test_edge_table_takeset", test_edge_table_takeset }, { "test_edge_table_copy_semantics", test_edge_table_copy_semantics }, { "test_edge_table_squash", test_edge_table_squash }, { "test_edge_table_squash_multiple_parents", test_edge_table_squash_multiple_parents }, { "test_edge_table_squash_empty", test_edge_table_squash_empty }, { "test_edge_table_squash_single_edge", test_edge_table_squash_single_edge }, { "test_edge_table_squash_bad_intervals", test_edge_table_squash_bad_intervals }, { "test_edge_table_squash_metadata", test_edge_table_squash_metadata }, { "test_site_table", test_site_table }, { "test_site_table_update_row", test_site_table_update_row }, { "test_site_table_keep_rows", test_site_table_keep_rows }, { "test_site_table_takeset", test_site_table_takeset }, { "test_mutation_table", test_mutation_table }, { "test_mutation_table_update_row", test_mutation_table_update_row }, { "test_mutation_table_takeset", test_mutation_table_takeset }, { "test_mutation_table_keep_rows", test_mutation_table_keep_rows }, { "test_mutation_table_keep_rows_parent_references", test_mutation_table_keep_rows_parent_references }, { "test_migration_table", test_migration_table }, { "test_migration_table_update_row", test_migration_table_update_row }, { 
"test_migration_table_keep_rows", test_migration_table_keep_rows }, { "test_migration_table_takeset", test_migration_table_takeset }, { "test_individual_table", test_individual_table }, { "test_individual_table_takeset", test_individual_table_takeset }, { "test_individual_table_update_row", test_individual_table_update_row }, { "test_individual_table_keep_rows", test_individual_table_keep_rows }, { "test_individual_table_keep_rows_parent_references", test_individual_table_keep_rows_parent_references }, { "test_population_table", test_population_table }, { "test_population_table_update_row", test_population_table_update_row }, { "test_population_table_keep_rows", test_population_table_keep_rows }, { "test_population_table_takeset", test_population_table_takeset }, { "test_provenance_table", test_provenance_table }, { "test_provenance_table_update_row", test_provenance_table_update_row }, { "test_provenance_table_keep_rows", test_provenance_table_keep_rows }, { "test_provenance_table_takeset", test_provenance_table_takeset }, { "test_table_size_increments", test_table_size_increments }, { "test_table_expansion", test_table_expansion }, { "test_ragged_expansion", test_ragged_expansion }, { "test_table_collection_equals_options", test_table_collection_equals_options }, { "test_table_collection_simplify_errors", test_table_collection_simplify_errors }, { "test_table_collection_time_units", test_table_collection_time_units }, { "test_table_collection_reference_sequence", test_table_collection_reference_sequence }, { "test_table_collection_has_reference_sequence", test_table_collection_has_reference_sequence }, { "test_table_collection_metadata", test_table_collection_metadata }, { "test_reference_sequence_state_machine", test_reference_sequence_state_machine }, { "test_reference_sequence_take", test_reference_sequence_take }, { "test_reference_sequence", test_reference_sequence }, { "test_simplify_tables_drops_indexes", test_simplify_tables_drops_indexes }, { 
"test_simplify_empty_tables", test_simplify_empty_tables }, { "test_simplify_metadata", test_simplify_metadata }, { "test_link_ancestors_no_edges", test_link_ancestors_no_edges }, { "test_link_ancestors_input_errors", test_link_ancestors_input_errors }, { "test_link_ancestors_single_tree", test_link_ancestors_single_tree }, { "test_link_ancestors_paper", test_link_ancestors_paper }, { "test_link_ancestors_samples_and_ancestors_overlap", test_link_ancestors_samples_and_ancestors_overlap }, { "test_link_ancestors_multiple_to_single_tree", test_link_ancestors_multiple_to_single_tree }, { "test_ibd_segments_debug", test_ibd_segments_debug }, { "test_ibd_segments_caterpillar_tree", test_ibd_segments_caterpillar_tree }, { "test_ibd_segments_single_tree", test_ibd_segments_single_tree }, { "test_ibd_segments_single_tree_options", test_ibd_segments_single_tree_options }, { "test_ibd_segments_multiple_trees", test_ibd_segments_multiple_trees }, { "test_ibd_segments_empty_result", test_ibd_segments_empty_result }, { "test_ibd_segments_min_span_max_time", test_ibd_segments_min_span_max_time }, { "test_ibd_segments_single_tree_between", test_ibd_segments_single_tree_between }, { "test_ibd_segments_samples_are_descendants", test_ibd_segments_samples_are_descendants }, { "test_ibd_segments_multiple_ibd_paths", test_ibd_segments_multiple_ibd_paths }, { "test_ibd_segments_odd_topologies", test_ibd_segments_odd_topologies }, { "test_ibd_segments_errors", test_ibd_segments_errors }, { "test_sorter_interface", test_sorter_interface }, { "test_sort_tables_canonical_errors", test_sort_tables_canonical_errors }, { "test_sort_tables_canonical", test_sort_tables_canonical }, { "test_sort_tables_drops_indexes", test_sort_tables_drops_indexes }, { "test_sort_tables_edge_metadata", test_sort_tables_edge_metadata }, { "test_sort_tables_errors", test_sort_tables_errors }, { "test_sort_tables_individuals", test_sort_tables_individuals }, { "test_sort_tables_mutation_times", 
test_sort_tables_mutation_times }, { "test_sort_tables_mutations", test_sort_tables_mutations }, { "test_sort_tables_migrations", test_sort_tables_migrations }, { "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata }, { "test_sort_tables_offsets", test_sort_tables_offsets }, { "test_edge_update_invalidates_index", test_edge_update_invalidates_index }, { "test_copy_table_collection", test_copy_table_collection }, { "test_dump_unindexed", test_dump_unindexed }, { "test_dump_load_empty", test_dump_load_empty }, { "test_dump_load_unsorted", test_dump_load_unsorted }, { "test_dump_load_metadata_schema", test_dump_load_metadata_schema }, { "test_dump_fail_no_file", test_dump_fail_no_file }, { "test_load_reindex", test_load_reindex }, { "test_table_overflow", test_table_overflow }, { "test_column_overflow", test_column_overflow }, { "test_table_collection_check_integrity", test_table_collection_check_integrity }, { "test_table_collection_check_integrity_no_populations", test_table_collection_check_integrity_no_populations }, { "test_table_collection_check_integrity_bad_indexes_example", test_table_collection_check_integrity_bad_indexes_example }, { "test_table_collection_check_integrity_bad_indexes", test_table_collection_check_integrity_bad_indexes }, { "test_check_integrity_bad_mutation_parent_topology", test_check_integrity_bad_mutation_parent_topology }, { "test_table_collection_compute_mutation_parents_tolerates_invalid_input", test_table_collection_compute_mutation_parents_tolerates_invalid_input }, { "test_table_collection_compute_mutation_parents_restores_on_error", test_table_collection_compute_mutation_parents_restores_on_error }, { "test_table_collection_subset", test_table_collection_subset }, { "test_table_collection_subset_unsorted", test_table_collection_subset_unsorted }, { "test_table_collection_subset_errors", test_table_collection_subset_errors }, { "test_table_collection_union", test_table_collection_union }, { 
"test_table_collection_disjoint_union", test_table_collection_disjoint_union }, { "test_table_collection_union_middle_merge", test_table_collection_union_middle_merge }, { "test_table_collection_union_errors", test_table_collection_union_errors }, { "test_table_collection_clear", test_table_collection_clear }, { "test_table_collection_takeset_indexes", test_table_collection_takeset_indexes }, { "test_table_collection_delete_older", test_table_collection_delete_older }, { NULL, NULL }, }; return test_main(tests, argc, argv); } ================================================ FILE: c/tests/test_trees.c ================================================ /* * MIT License * * Copyright (c) 2019-2024 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "testlib.h" #include #include #include #include /*======================================================= * Verification utilities. 
*======================================================*/ /* Checks if the specified trees are topologically equivalent, i.e, represent * the same tree without checking state specific to seeking.*/ static void check_trees_equal(tsk_tree_t *self, tsk_tree_t *other) { tsk_size_t N = self->num_nodes; CU_ASSERT_FATAL(self->tree_sequence == other->tree_sequence); CU_ASSERT_FATAL(self->index == other->index); CU_ASSERT_FATAL(self->interval.left == other->interval.left); CU_ASSERT_FATAL(self->interval.right == other->interval.right); CU_ASSERT_FATAL(self->sites_length == other->sites_length); CU_ASSERT_FATAL(self->sites == other->sites); CU_ASSERT_FATAL(self->samples == other->samples); CU_ASSERT_FATAL(self->num_edges == other->num_edges); CU_ASSERT_FATAL(tsk_memcmp(self->parent, other->parent, N * sizeof(tsk_id_t)) == 0); CU_ASSERT_FATAL(tsk_tree_equals(self, other)); } static void check_trees_identical(tsk_tree_t *self, tsk_tree_t *other) { tsk_size_t N = self->num_nodes; check_trees_equal(self, other); CU_ASSERT_FATAL(self->left_index == other->left_index); CU_ASSERT_FATAL(self->right_index == other->right_index); CU_ASSERT_FATAL(self->direction == other->direction); CU_ASSERT_FATAL( tsk_memcmp(self->left_child, other->left_child, N * sizeof(tsk_id_t)) == 0); CU_ASSERT_FATAL( tsk_memcmp(self->right_child, other->right_child, N * sizeof(tsk_id_t)) == 0); CU_ASSERT_FATAL( tsk_memcmp(self->left_sib, other->left_sib, N * sizeof(tsk_id_t)) == 0); CU_ASSERT_FATAL( tsk_memcmp(self->right_sib, other->right_sib, N * sizeof(tsk_id_t)) == 0); CU_ASSERT_FATAL( tsk_memcmp(self->num_children, other->num_children, N * sizeof(tsk_id_t)) == 0); CU_ASSERT_FATAL(tsk_memcmp(self->edge, other->edge, N * sizeof(tsk_id_t)) == 0); CU_ASSERT_EQUAL_FATAL(self->num_samples == NULL, other->num_samples == NULL) CU_ASSERT_EQUAL_FATAL( self->num_tracked_samples == NULL, other->num_tracked_samples == NULL) if (self->num_samples != NULL) { CU_ASSERT_FATAL(tsk_memcmp(self->num_samples, 
other->num_samples, N * sizeof(*self->num_samples)) == 0); CU_ASSERT_FATAL(tsk_memcmp(self->num_tracked_samples, other->num_tracked_samples, N * sizeof(*self->num_tracked_samples)) == 0); } CU_ASSERT_EQUAL_FATAL(self->left_sample == NULL, other->left_sample == NULL) CU_ASSERT_EQUAL_FATAL(self->right_sample == NULL, other->left_sample == NULL) CU_ASSERT_EQUAL_FATAL(self->next_sample == NULL, other->next_sample == NULL) if (self->left_sample != NULL) { CU_ASSERT_FATAL(tsk_memcmp(self->left_sample, other->left_sample, N * sizeof(*self->left_sample)) == 0); CU_ASSERT_FATAL(tsk_memcmp(self->right_sample, other->right_sample, N * sizeof(*self->right_sample)) == 0); CU_ASSERT_FATAL( tsk_memcmp(self->next_sample, other->next_sample, self->tree_sequence->num_samples * sizeof(*self->next_sample)) == 0); } } static void verify_compute_mutation_parents(tsk_treeseq_t *ts) { int ret; tsk_size_t size = tsk_treeseq_get_num_mutations(ts) * sizeof(tsk_id_t); tsk_id_t *parent = tsk_malloc(size); tsk_table_collection_t tables; CU_ASSERT_FATAL(parent != NULL); ret = tsk_treeseq_copy_tables(ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memcpy(parent, tables.mutations.parent, size); /* tsk_table_collection_print_state(&tables, stdout); */ /* Make sure the tables are actually updated */ tsk_memset(tables.mutations.parent, 0xff, size); ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tsk_memcmp(parent, tables.mutations.parent, size), 0); /* printf("after\n"); */ /* tsk_table_collection_print_state(&tables, stdout); */ free(parent); tsk_table_collection_free(&tables); } static void verify_compute_mutation_times(tsk_treeseq_t *ts) { int ret; tsk_size_t j; tsk_size_t size = tsk_treeseq_get_num_mutations(ts) * sizeof(tsk_id_t); tsk_id_t *time = tsk_malloc(size); tsk_table_collection_t tables; CU_ASSERT_FATAL(time != NULL); ret = tsk_treeseq_copy_tables(ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memcpy(time, 
tables.mutations.time, size); /* Time should be set to TSK_UNKNOWN_TIME before computing */ for (j = 0; j < size; j++) { tables.mutations.time[j] = TSK_UNKNOWN_TIME; } ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tsk_memcmp(time, tables.mutations.time, size), 0); free(time); tsk_table_collection_free(&tables); } static void verify_individual_nodes(tsk_treeseq_t *ts) { int ret; tsk_individual_t individual; tsk_id_t k; tsk_size_t num_nodes = tsk_treeseq_get_num_nodes(ts); tsk_size_t num_individuals = tsk_treeseq_get_num_individuals(ts); tsk_size_t j; for (k = 0; k < (tsk_id_t) num_individuals; k++) { ret = tsk_treeseq_get_individual(ts, k, &individual); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < individual.nodes_length; j++) { CU_ASSERT_FATAL(individual.nodes[j] < (tsk_id_t) num_nodes); CU_ASSERT_EQUAL_FATAL(k, ts->tables->nodes.individual[individual.nodes[j]]); } } } static void verify_tree_pos(const tsk_treeseq_t *ts, tsk_size_t num_trees, tsk_id_t *tree_parents) { int ret; const tsk_size_t N = tsk_treeseq_get_num_nodes(ts); const tsk_id_t *edges_parent = ts->tables->edges.parent; const tsk_id_t *edges_child = ts->tables->edges.child; const double *restrict edges_left = ts->tables->edges.left; const double *restrict edges_right = ts->tables->edges.right; tsk_tree_position_t tree_pos; tsk_id_t *known_parent; tsk_id_t *parent = tsk_malloc(N * sizeof(*parent)); tsk_id_t u, index, j, e; bool valid; CU_ASSERT_FATAL(parent != NULL); ret = tsk_tree_position_init(&tree_pos, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); for (u = 0; u < (tsk_id_t) N; u++) { parent[u] = TSK_NULL; } for (index = 0; index < (tsk_id_t) num_trees; index++) { known_parent = tree_parents + N * (tsk_size_t) index; valid = tsk_tree_position_next(&tree_pos); CU_ASSERT_TRUE(valid); CU_ASSERT_EQUAL(index, tree_pos.index); for (j = tree_pos.out.start; j < tree_pos.out.stop; j++) { e = tree_pos.out.order[j]; parent[edges_child[e]] = 
TSK_NULL; } for (j = tree_pos.in.start; j < tree_pos.in.stop; j++) { e = tree_pos.in.order[j]; parent[edges_child[e]] = edges_parent[e]; } for (u = 0; u < (tsk_id_t) N; u++) { CU_ASSERT_EQUAL(parent[u], known_parent[u]); } } valid = tsk_tree_position_next(&tree_pos); CU_ASSERT_FALSE(valid); for (j = tree_pos.out.start; j < tree_pos.out.stop; j++) { e = tree_pos.out.order[j]; parent[edges_child[e]] = TSK_NULL; } for (u = 0; u < (tsk_id_t) N; u++) { CU_ASSERT_EQUAL(parent[u], TSK_NULL); } for (index = (tsk_id_t) num_trees - 1; index >= 0; index--) { known_parent = tree_parents + N * (tsk_size_t) index; valid = tsk_tree_position_prev(&tree_pos); CU_ASSERT_TRUE(valid); CU_ASSERT_EQUAL(index, tree_pos.index); for (j = tree_pos.out.start; j > tree_pos.out.stop; j--) { e = tree_pos.out.order[j]; parent[edges_child[e]] = TSK_NULL; } for (j = tree_pos.in.start; j > tree_pos.in.stop; j--) { CU_ASSERT_FATAL(j >= 0); e = tree_pos.in.order[j]; parent[edges_child[e]] = edges_parent[e]; } for (u = 0; u < (tsk_id_t) N; u++) { CU_ASSERT_EQUAL(parent[u], known_parent[u]); } } valid = tsk_tree_position_prev(&tree_pos); CU_ASSERT_FALSE(valid); for (j = tree_pos.out.start; j > tree_pos.out.stop; j--) { e = tree_pos.out.order[j]; parent[edges_child[e]] = TSK_NULL; } for (u = 0; u < (tsk_id_t) N; u++) { CU_ASSERT_EQUAL(parent[u], TSK_NULL); } for (index = 0; index < (tsk_id_t) num_trees; index++) { known_parent = tree_parents + N * (tsk_size_t) index; ret = tsk_tree_position_init(&tree_pos, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_position_seek_forward(&tree_pos, index); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(index, tree_pos.index); for (j = tree_pos.in.start; j != tree_pos.in.stop; j++) { e = tree_pos.in.order[j]; if (edges_left[e] <= tree_pos.interval.left && tree_pos.interval.left < edges_right[e]) { parent[edges_child[e]] = edges_parent[e]; } } for (u = 0; u < (tsk_id_t) N; u++) { CU_ASSERT_EQUAL(parent[u], known_parent[u]); } tsk_tree_position_free(&tree_pos); 
for (u = 0; u < (tsk_id_t) N; u++) { parent[u] = TSK_NULL; } } valid = tsk_tree_position_next(&tree_pos); CU_ASSERT_FALSE(valid); for (index = (tsk_id_t) num_trees - 1; index >= 0; index--) { known_parent = tree_parents + N * (tsk_size_t) index; ret = tsk_tree_position_init(&tree_pos, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_position_seek_backward(&tree_pos, index); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(index, tree_pos.index); for (j = tree_pos.in.start; j != tree_pos.in.stop; j--) { e = tree_pos.in.order[j]; if (edges_right[e] >= tree_pos.interval.right && tree_pos.interval.right > edges_left[e]) { parent[edges_child[e]] = edges_parent[e]; } } for (u = 0; u < (tsk_id_t) N; u++) { CU_ASSERT_EQUAL(parent[u], known_parent[u]); } for (u = 0; u < (tsk_id_t) N; u++) { parent[u] = TSK_NULL; } tsk_tree_position_free(&tree_pos); } tsk_safe_free(parent); } static void verify_trees(tsk_treeseq_t *ts, tsk_size_t num_trees, tsk_id_t *parents) { int ret; tsk_id_t u, j, v; uint32_t mutation_index, site_index; tsk_size_t k, l, tree_sites_length; const tsk_site_t *sites = NULL; tsk_tree_t tree, skip_tree; tsk_size_t num_edges; tsk_size_t num_nodes = tsk_treeseq_get_num_nodes(ts); tsk_size_t num_sites = tsk_treeseq_get_num_sites(ts); tsk_size_t num_mutations = tsk_treeseq_get_num_mutations(ts); const double *breakpoints = tsk_treeseq_get_breakpoints(ts); ret = tsk_tree_init(&tree, ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_init(&skip_tree, ts, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(ts), num_trees); CU_ASSERT_EQUAL(tree.index, -1); site_index = 0; mutation_index = 0; j = 0; for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { CU_ASSERT_EQUAL(j, (tsk_id_t) tree.index); tsk_tree_print_state(&tree, _devnull); /* tsk_tree_print_state(&tree, stdout); */ CU_ASSERT_EQUAL(tree.interval.left, breakpoints[j]); num_edges = 0; for (u = 0; u < (tsk_id_t) num_nodes; u++) { ret = tsk_tree_get_parent(&tree, u, &v); 
CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(v, parents[j * (tsk_id_t) num_nodes + u]); if (v != TSK_NULL) { num_edges++; } } CU_ASSERT_EQUAL(num_edges, tree.num_edges); ret = tsk_tree_get_sites(&tree, &sites, &tree_sites_length); CU_ASSERT_EQUAL(ret, 0); for (k = 0; k < tree_sites_length; k++) { CU_ASSERT_EQUAL(sites[k].id, (tsk_id_t) site_index); for (l = 0; l < sites[k].mutations_length; l++) { CU_ASSERT_EQUAL(sites[k].mutations[l].id, (tsk_id_t) mutation_index); CU_ASSERT_EQUAL(sites[k].mutations[l].site, (tsk_id_t) site_index); mutation_index++; } site_index++; } /* Check the skip tree */ ret = tsk_tree_first(&skip_tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); ret = tsk_tree_seek(&skip_tree, breakpoints[j], TSK_SEEK_SKIP); CU_ASSERT_EQUAL(ret, 0); /* Calling print_state here also verifies the integrity of the tree */ tsk_tree_print_state(&skip_tree, _devnull); check_trees_equal(&tree, &skip_tree); ret = tsk_tree_last(&skip_tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); ret = tsk_tree_seek(&skip_tree, breakpoints[j], TSK_SEEK_SKIP); CU_ASSERT_EQUAL(ret, 0); tsk_tree_print_state(&skip_tree, _devnull); check_trees_equal(&tree, &skip_tree); j++; } CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(site_index, num_sites); CU_ASSERT_EQUAL(mutation_index, num_mutations); CU_ASSERT_EQUAL(tree.index, -1); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(ts), breakpoints[j]); tsk_tree_free(&tree); tsk_tree_free(&skip_tree); verify_tree_pos(ts, num_trees, parents); } static tsk_tree_t * get_tree_list(tsk_treeseq_t *ts) { int ret; tsk_tree_t t, *trees; tsk_size_t num_trees; num_trees = tsk_treeseq_get_num_trees(ts); ret = tsk_tree_init(&t, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); trees = tsk_malloc(num_trees * sizeof(tsk_tree_t)); CU_ASSERT_FATAL(trees != NULL); for (ret = tsk_tree_first(&t); ret == TSK_TREE_OK; ret = tsk_tree_next(&t)) { CU_ASSERT_FATAL(t.index < (tsk_id_t) num_trees); ret = tsk_tree_copy(&t, &trees[t.index], 0); CU_ASSERT_EQUAL_FATAL(ret, 0); check_trees_equal(&trees[t.index], 
&t); /* Make sure the left and right coordinates are also OK */ CU_ASSERT_EQUAL(trees[t.index].interval.left, t.interval.left); CU_ASSERT_EQUAL(trees[t.index].interval.right, t.interval.right); } CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_free(&t); CU_ASSERT_EQUAL_FATAL(ret, 0); return trees; } static void verify_tree_next_prev(tsk_treeseq_t *ts) { int ret; tsk_tree_t *trees, t; tsk_id_t j; tsk_id_t num_trees = (tsk_id_t) tsk_treeseq_get_num_trees(ts); trees = get_tree_list(ts); ret = tsk_tree_init(&t, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Single forward pass */ j = 0; for (ret = tsk_tree_first(&t); ret == TSK_TREE_OK; ret = tsk_tree_next(&t)) { CU_ASSERT_EQUAL_FATAL(j, t.index); check_trees_equal(&t, &trees[t.index]); j++; } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(j, num_trees); /* Single reverse pass */ j = num_trees; for (ret = tsk_tree_last(&t); ret == TSK_TREE_OK; ret = tsk_tree_prev(&t)) { CU_ASSERT_EQUAL_FATAL(j - 1, t.index); check_trees_equal(&t, &trees[t.index]); j--; } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(j, 0); /* Full forward, then reverse */ j = 0; for (ret = tsk_tree_first(&t); ret == TSK_TREE_OK; ret = tsk_tree_next(&t)) { CU_ASSERT_EQUAL_FATAL(j, t.index); check_trees_equal(&t, &trees[t.index]); j++; } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(j, num_trees); while ((ret = tsk_tree_prev(&t)) == TSK_TREE_OK) { CU_ASSERT_EQUAL_FATAL(j - 1, t.index); check_trees_equal(&t, &trees[t.index]); j--; } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(j, 0); CU_ASSERT_EQUAL_FATAL(t.index, -1); /* Full reverse then forward */ j = num_trees; for (ret = tsk_tree_last(&t); ret == TSK_TREE_OK; ret = tsk_tree_prev(&t)) { CU_ASSERT_EQUAL_FATAL(j - 1, t.index); check_trees_equal(&t, &trees[t.index]); j--; } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(j, 0); while ((ret = tsk_tree_next(&t)) == TSK_TREE_OK) { CU_ASSERT_EQUAL_FATAL(j, t.index); check_trees_equal(&t, &trees[t.index]); j++; } 
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(j, num_trees); CU_ASSERT_EQUAL_FATAL(t.index, -1); /* Do a zigzagging traversal */ ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); for (j = 1; j < TSK_MIN(10, num_trees / 2); j++) { while (t.index < num_trees - j) { ret = tsk_tree_next(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); } CU_ASSERT_EQUAL_FATAL(t.index, num_trees - j); check_trees_equal(&t, &trees[t.index]); while (t.index > j) { ret = tsk_tree_prev(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); } CU_ASSERT_EQUAL_FATAL(t.index, j); check_trees_equal(&t, &trees[t.index]); } ret = tsk_tree_clear(&t); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Calling next() on a cleared tree should be the same as first() */ j = 0; while ((ret = tsk_tree_next(&t)) == TSK_TREE_OK) { CU_ASSERT_EQUAL_FATAL(j, t.index); check_trees_equal(&t, &trees[t.index]); j++; } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(j, num_trees); ret = tsk_tree_free(&t); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_init(&t, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Calling prev() on an uninitialised tree should be the same as last() */ j = num_trees; while ((ret = tsk_tree_prev(&t)) == TSK_TREE_OK) { CU_ASSERT_EQUAL_FATAL(j - 1, t.index); check_trees_equal(&t, &trees[t.index]); j--; } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(j, 0); /* Free the trees. 
*/ ret = tsk_tree_free(&t); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) tsk_treeseq_get_num_trees(ts); j++) { tsk_tree_free(&trees[j]); } free(trees); } static void verify_edge_array_single_tree( tsk_tree_t *tree, tsk_edge_table_t *edge_table, tsk_size_t num_nodes) { int ret; tsk_id_t c, edge_id; tsk_edge_t edge; tsk_size_t count_edges = 0; for (c = 0; c <= (tsk_id_t) num_nodes; c++) { edge_id = tree->edge[c]; if (edge_id == TSK_NULL) { /*c is either (virtual) root, or is not associated with an edge along this tree */ CU_ASSERT_EQUAL(tree->parent[c], TSK_NULL); } else { ret = tsk_edge_table_get_row(edge_table, edge_id, &edge); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(edge.id, edge_id); CU_ASSERT_EQUAL(edge.parent, tree->parent[c]); CU_ASSERT_EQUAL(edge.child, c); count_edges++; } } CU_ASSERT_EQUAL(count_edges, tree->num_edges); } static void verify_edge_array_trees(tsk_treeseq_t *ts) { int ret; tsk_tree_t t; tsk_edge_table_t edge_table; tsk_size_t num_nodes; tsk_id_t c; num_nodes = tsk_treeseq_get_num_nodes(ts); edge_table = ts->tables->edges; ret = tsk_tree_init(&t, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* verify initialized edge array */ for (c = 0; c <= (tsk_id_t) num_nodes; c++) { CU_ASSERT_EQUAL(t.edge[c], TSK_NULL) } /* verify edge array for each tree in treesequence */ for (ret = tsk_tree_first(&t); ret == TSK_TREE_OK; ret = tsk_tree_next(&t)) { verify_edge_array_single_tree(&t, &edge_table, num_nodes); } CU_ASSERT_EQUAL_FATAL(ret, 0); /* verify cleared edge array */ for (c = 0; c <= (tsk_id_t) num_nodes; c++) { CU_ASSERT_EQUAL(t.edge[c], TSK_NULL) } tsk_tree_free(&t); } /* When we keep all sites in simplify, the genotypes for the subset of the * samples should be the same as the original */ static void verify_simplify_genotypes(tsk_treeseq_t *ts, tsk_treeseq_t *subset, const tsk_id_t *samples, tsk_size_t num_samples) { int ret; tsk_size_t m = tsk_treeseq_get_num_sites(ts); tsk_vargen_t vargen, subset_vargen; tsk_variant_t *variant, 
*subset_variant; tsk_size_t j, k; int32_t a1, a2; const tsk_id_t *sample_index_map; sample_index_map = tsk_treeseq_get_sample_index_map(ts); /* tsk_treeseq_print_state(ts, stdout); */ /* tsk_treeseq_print_state(subset, stdout); */ ret = tsk_vargen_init(&vargen, ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_init( &subset_vargen, subset, NULL, 0, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(m, tsk_treeseq_get_num_sites(subset)); for (j = 0; j < m; j++) { ret = tsk_vargen_next(&vargen, &variant); CU_ASSERT_EQUAL_FATAL(ret, 1); ret = tsk_vargen_next(&subset_vargen, &subset_variant); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(variant->site.id, (tsk_id_t) j) CU_ASSERT_EQUAL(subset_variant->site.id, (tsk_id_t) j) CU_ASSERT_EQUAL(variant->site.position, subset_variant->site.position); for (k = 0; k < num_samples; k++) { CU_ASSERT_FATAL(sample_index_map[samples[k]] < (tsk_id_t) ts->num_samples); a1 = variant->genotypes[sample_index_map[samples[k]]]; a2 = subset_variant->genotypes[k]; /* printf("a1 = %d, a2 = %d\n", a1, a2); */ /* printf("k = %d original node = %d " */ /* "original_index = %d a1=%.*s a2=%.*s\n", */ /* (int) k, samples[k], sample_index_map[samples[k]], */ /* variant->allele_lengths[a1], variant->alleles[a1], */ /* subset_variant->allele_lengths[a2], subset_variant->alleles[a2]); */ CU_ASSERT_FATAL(a1 < (int) variant->num_alleles); CU_ASSERT_FATAL(a2 < (int) subset_variant->num_alleles); CU_ASSERT_EQUAL_FATAL( variant->allele_lengths[a1], subset_variant->allele_lengths[a2]); CU_ASSERT_NSTRING_EQUAL_FATAL(variant->alleles[a1], subset_variant->alleles[a2], variant->allele_lengths[a1]); } } tsk_vargen_free(&vargen); tsk_vargen_free(&subset_vargen); } static void verify_simplify_properties(tsk_treeseq_t *ts, tsk_treeseq_t *subset, const tsk_id_t *samples, tsk_size_t num_samples, tsk_id_t *node_map) { int ret; tsk_node_t n1, n2; tsk_tree_t full_tree, subset_tree; const tsk_site_t *tree_sites; 
tsk_size_t tree_sites_length; uint32_t j, k; tsk_id_t u, mrca1, mrca2; tsk_size_t total_sites; CU_ASSERT_EQUAL( tsk_treeseq_get_sequence_length(ts), tsk_treeseq_get_sequence_length(subset)); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(subset), num_samples); CU_ASSERT(tsk_treeseq_get_num_nodes(ts) >= tsk_treeseq_get_num_nodes(subset)); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(subset), num_samples); /* Check the sample properties */ for (j = 0; j < num_samples; j++) { ret = tsk_treeseq_get_node(ts, samples[j], &n1); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(node_map[samples[j]], (tsk_id_t) j); ret = tsk_treeseq_get_node(subset, node_map[samples[j]], &n2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n1.population, n2.population); CU_ASSERT_EQUAL_FATAL(n1.time, n2.time); CU_ASSERT_EQUAL_FATAL(n1.flags, n2.flags); CU_ASSERT_EQUAL_FATAL(n1.metadata_length, n2.metadata_length); CU_ASSERT_NSTRING_EQUAL(n1.metadata, n2.metadata, n2.metadata_length); } /* Check that node mappings are correct */ for (j = 0; j < tsk_treeseq_get_num_nodes(ts); j++) { ret = tsk_treeseq_get_node(ts, (tsk_id_t) j, &n1); CU_ASSERT_EQUAL_FATAL(ret, 0); if (node_map[j] != TSK_NULL) { ret = tsk_treeseq_get_node(subset, node_map[j], &n2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n1.population, n2.population); CU_ASSERT_EQUAL_FATAL(n1.time, n2.time); CU_ASSERT_EQUAL_FATAL(n1.flags, n2.flags); CU_ASSERT_EQUAL_FATAL(n1.metadata_length, n2.metadata_length); CU_ASSERT_NSTRING_EQUAL(n1.metadata, n2.metadata, n2.metadata_length); } } if (num_samples == 0) { CU_ASSERT_EQUAL(tsk_treeseq_get_num_edges(subset), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(subset), 0); } else if (num_samples == 1) { CU_ASSERT_EQUAL(tsk_treeseq_get_num_edges(subset), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(subset), 1); } /* Check the pairwise MRCAs */ ret = tsk_tree_init(&full_tree, ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_init(&subset_tree, subset, 0); CU_ASSERT_EQUAL(ret, 0); ret = 
tsk_tree_first(&full_tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); ret = tsk_tree_first(&subset_tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); total_sites = 0; while (1) { while (full_tree.interval.right <= subset_tree.interval.right) { for (j = 0; j < num_samples; j++) { for (k = j + 1; k < num_samples; k++) { ret = tsk_tree_get_mrca(&full_tree, samples[j], samples[k], &mrca1); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_mrca(&subset_tree, node_map[samples[j]], node_map[samples[k]], &mrca2); CU_ASSERT_EQUAL_FATAL(ret, 0); if (mrca1 == TSK_NULL) { CU_ASSERT_EQUAL_FATAL(mrca2, TSK_NULL); } else { CU_ASSERT_EQUAL(node_map[mrca1], mrca2); } } } ret = tsk_tree_next(&full_tree); CU_ASSERT_FATAL(ret >= 0); if (ret != 1) { break; } } /* Check the sites in this tree */ ret = tsk_tree_get_sites(&subset_tree, &tree_sites, &tree_sites_length); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < tree_sites_length; j++) { CU_ASSERT(subset_tree.interval.left <= tree_sites[j].position); CU_ASSERT(tree_sites[j].position < subset_tree.interval.right); for (k = 0; k < tree_sites[j].mutations_length; k++) { ret = tsk_tree_get_parent( &subset_tree, tree_sites[j].mutations[k].node, &u); CU_ASSERT_EQUAL(ret, 0); } total_sites++; } ret = tsk_tree_next(&subset_tree); if (ret != 1) { break; } } CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(subset), total_sites); tsk_tree_free(&subset_tree); tsk_tree_free(&full_tree); } static void verify_simplify(tsk_treeseq_t *ts) { int ret; tsk_size_t n = tsk_treeseq_get_num_samples(ts); tsk_size_t num_samples[] = { 0, 1, 2, 3, n / 2, n - 1, n }; tsk_size_t j; const tsk_id_t *sample; tsk_id_t *node_map = tsk_malloc(tsk_treeseq_get_num_nodes(ts) * sizeof(tsk_id_t)); tsk_treeseq_t subset; tsk_flags_t options = TSK_SIMPLIFY_FILTER_SITES; CU_ASSERT_FATAL(node_map != NULL); sample = tsk_treeseq_get_samples(ts); if (tsk_treeseq_get_num_migrations(ts) > 0) { ret = tsk_treeseq_simplify(ts, sample, 2, 0, &subset, NULL); 
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED); /* Exiting early here because simplify isn't supported with migrations. */ goto out; } for (j = 0; j < sizeof(num_samples) / sizeof(*num_samples); j++) { if (num_samples[j] <= n) { ret = tsk_treeseq_simplify( ts, sample, num_samples[j], options, &subset, node_map); /* printf("ret = %s\n", tsk_strerror(ret)); */ CU_ASSERT_EQUAL_FATAL(ret, 0); verify_simplify_properties(ts, &subset, sample, num_samples[j], node_map); tsk_treeseq_free(&subset); /* Keep all sites */ ret = tsk_treeseq_simplify(ts, sample, num_samples[j], 0, &subset, node_map); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_simplify_properties(ts, &subset, sample, num_samples[j], node_map); verify_simplify_genotypes(ts, &subset, sample, num_samples[j]); tsk_treeseq_free(&subset); } } out: free(node_map); } typedef struct { tsk_id_t tree_index; tsk_id_t node; tsk_size_t count; } sample_count_test_t; static void verify_sample_counts(tsk_treeseq_t *ts, tsk_size_t num_tests, sample_count_test_t *tests, tsk_flags_t seek_options) { int ret; tsk_size_t j, num_samples, n, k; tsk_id_t stop, sample_index; tsk_tree_t tree; const tsk_id_t *samples; n = tsk_treeseq_get_num_samples(ts); samples = tsk_treeseq_get_samples(ts); /* First run with the TSK_NO_SAMPLE_COUNTS feature */ ret = tsk_tree_init(&tree, ts, TSK_NO_SAMPLE_COUNTS); CU_ASSERT_EQUAL(ret, 0); for (j = 0; j < num_tests; j++) { ret = tsk_tree_seek_index(&tree, tests[j].tree_index, seek_options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_num_samples(&tree, tests[j].node, &num_samples); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tests[j].count, num_samples); /* all operations depending on tracked samples should fail. 
*/ ret = tsk_tree_get_num_tracked_samples(&tree, 0, &num_samples); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSUPPORTED_OPERATION); /* The left root should be NULL */ CU_ASSERT_EQUAL(tsk_tree_get_left_root(&tree), TSK_NULL); } tsk_tree_free(&tree); /* Now run with TSK_SAMPLE_COUNTS but with no samples tracked. */ ret = tsk_tree_init(&tree, ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); for (j = 0; j < num_tests; j++) { ret = tsk_tree_seek_index(&tree, tests[j].tree_index, seek_options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_num_samples(&tree, tests[j].node, &num_samples); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tests[j].count, num_samples); /* No samples are tracked, so the tracked-sample query succeeds and reports zero. */ ret = tsk_tree_get_num_tracked_samples(&tree, 0, &num_samples); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(num_samples, 0); /* The virtual root should not be NULL */ CU_ASSERT_NOT_EQUAL(tree.virtual_root, TSK_NULL); } tsk_tree_free(&tree); /* Run with TSK_SAMPLE_LISTS and TSK_NO_SAMPLE_COUNTS */ ret = tsk_tree_init(&tree, ts, TSK_SAMPLE_LISTS | TSK_NO_SAMPLE_COUNTS); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); for (j = 0; j < num_tests; j++) { ret = tsk_tree_seek_index(&tree, tests[j].tree_index, seek_options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_num_samples(&tree, tests[j].node, &num_samples); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tests[j].count, num_samples); /* all operations depending on tracked samples should fail. 
*/ ret = tsk_tree_get_num_tracked_samples(&tree, 0, &num_samples); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSUPPORTED_OPERATION); sample_index = tree.left_sample[tests[j].node]; k = 0; if (sample_index != TSK_NULL) { stop = tree.right_sample[tests[j].node]; while (true) { k++; CU_ASSERT_FATAL(k <= tests[j].count); if (sample_index == stop) { break; } sample_index = tree.next_sample[sample_index]; } } CU_ASSERT_EQUAL(tests[j].count, k); } tsk_tree_free(&tree); /* Now use TSK_SAMPLE_LISTS */ ret = tsk_tree_init(&tree, ts, TSK_SAMPLE_LISTS); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_set_tracked_samples(&tree, n, samples); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); for (j = 0; j < num_tests; j++) { ret = tsk_tree_seek_index(&tree, tests[j].tree_index, seek_options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_num_samples(&tree, tests[j].node, &num_samples); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tests[j].count, num_samples); /* We're tracking all samples, so the count should be the same */ ret = tsk_tree_get_num_tracked_samples(&tree, tests[j].node, &num_samples); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tests[j].count, num_samples); sample_index = tree.left_sample[tests[j].node]; k = 0; if (sample_index != TSK_NULL) { stop = tree.right_sample[tests[j].node]; while (true) { k++; if (sample_index == stop) { break; } sample_index = tree.next_sample[sample_index]; } } CU_ASSERT_EQUAL(tests[j].count, k); } tsk_tree_free(&tree); } static void verify_sample_sets_for_tree(tsk_tree_t *tree) { int ret, stack_top, j; tsk_id_t u, v; tsk_size_t tmp, n, num_nodes, num_samples; tsk_id_t *stack, *samples; const tsk_treeseq_t *ts = tree->tree_sequence; tsk_id_t *sample_index_map = ts->sample_index_map; const tsk_id_t *list_left = tree->left_sample; const tsk_id_t *list_right = tree->right_sample; const tsk_id_t *list_next = tree->next_sample; tsk_id_t stop, sample_index; n = tsk_treeseq_get_num_samples(ts); num_nodes = 
tsk_treeseq_get_num_nodes(ts); stack = tsk_malloc(n * sizeof(tsk_id_t)); samples = tsk_malloc(n * sizeof(tsk_id_t)); CU_ASSERT_FATAL(stack != NULL); CU_ASSERT_FATAL(samples != NULL); for (u = 0; u < (tsk_id_t) num_nodes; u++) { if (tree->left_child[u] == TSK_NULL && !tsk_treeseq_is_sample(ts, u)) { CU_ASSERT_EQUAL(list_left[u], TSK_NULL); CU_ASSERT_EQUAL(list_right[u], TSK_NULL); } else { stack_top = 0; num_samples = 0; stack[stack_top] = u; while (stack_top >= 0) { v = stack[stack_top]; stack_top--; if (tsk_treeseq_is_sample(ts, v)) { samples[num_samples] = v; num_samples++; } for (v = tree->right_child[v]; v != TSK_NULL; v = tree->left_sib[v]) { stack_top++; stack[stack_top] = v; } } ret = tsk_tree_get_num_samples(tree, u, &tmp); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL_FATAL(num_samples, tmp); j = 0; sample_index = list_left[u]; if (sample_index != TSK_NULL) { stop = list_right[u]; while (true) { CU_ASSERT_TRUE_FATAL(j < (tsk_id_t) n); CU_ASSERT_EQUAL_FATAL(sample_index, sample_index_map[samples[j]]); j++; if (sample_index == stop) { break; } sample_index = list_next[sample_index]; } } CU_ASSERT_EQUAL_FATAL(j, (int) num_samples); } } free(stack); free(samples); } static void verify_sample_sets(tsk_treeseq_t *ts) { int ret; tsk_tree_t t; tsk_id_t j; ret = tsk_tree_init(&t, ts, TSK_SAMPLE_LISTS); CU_ASSERT_EQUAL(ret, 0); for (ret = tsk_tree_first(&t); ret == TSK_TREE_OK; ret = tsk_tree_next(&t)) { verify_sample_sets_for_tree(&t); } CU_ASSERT_EQUAL_FATAL(ret, 0); for (ret = tsk_tree_last(&t); ret == TSK_TREE_OK; ret = tsk_tree_prev(&t)) { verify_sample_sets_for_tree(&t); } CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < (tsk_id_t) tsk_treeseq_get_num_trees(ts); j++) { ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); ret = tsk_tree_seek_index(&t, j, TSK_SEEK_SKIP); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_sample_sets_for_tree(&t); ret = tsk_tree_last(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); ret = tsk_tree_seek_index(&t, j, TSK_SEEK_SKIP); 
CU_ASSERT_EQUAL_FATAL(ret, 0); verify_sample_sets_for_tree(&t); } tsk_tree_free(&t); } /* Asserts that ts has no edges, mutations, migrations or samples, that its sequence length equals sequence_length, and that it has exactly one tree. NOTE(review): the num_mutations assertion appears twice; one of them may have been intended to check a different table - confirm against the callers. */ static void verify_empty_tree_sequence(tsk_treeseq_t *ts, double sequence_length) { CU_ASSERT_EQUAL(tsk_treeseq_get_num_edges(ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_migrations(ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(ts), sequence_length); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(ts), 1); } /*======================================================= * Simplest test cases. *======================================================*/ /* Checks tsk_treeseq_get_discrete_genome(): it is true for the integer-coordinate example and becomes false whenever the sequence length or a coordinate is made non-integer, each value being restored before the next case. */ static void test_simplest_discrete_genome(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_id_t ret_id; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts)); ret = tsk_table_collection_copy(ts.tables, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); tables.sequence_length = 1.001; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tables.sequence_length = 1; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tables.edges.right[0] = 0.999; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tables.edges.right[0] = 1.0; tables.edges.left[0] = 0.999; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); 
tables.edges.left[0] = 0; ret_id = tsk_site_table_add_row(&tables.sites, 0, "A", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tables.sites.position[0] = 0.001; CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tables.sites.position[0] = 0; /* Need another population for a migration */ ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 1, 1.0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tables.migrations.left[0] = 0.001; CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tables.migrations.left[0] = 0; tables.migrations.right[0] = 0.999; CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tables.migrations.right[0] = 1; /* An empty tree sequence is has a discrete genome. 
*/ tsk_table_collection_clear(&tables, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts)); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_simplest_discrete_time(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n" "0 0 0\n" "0 0 0"; const char *edges = "0 1 2 0,1,3,4\n"; const char *sites = "0.1 0\n" "0.2 0\n" "0.3 0\n" "0.4 0\n"; const char *mutations = "0 0 1\n" "1 1 1\n" "2 3 1\n" "3 4 1"; const char *migrations = "0 1 0 0 1 1"; tsk_treeseq_from_text( &ts, 1, nodes, edges, migrations, sites, mutations, NULL, NULL, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_time(&ts)); ret = tsk_table_collection_copy(ts.tables, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_time(&ts)); tsk_treeseq_free(&ts); tables.nodes.time[0] = 0.0001; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_time(&ts)); tsk_treeseq_free(&ts); tables.nodes.time[0] = 0; tables.mutations.time[0] = 0.001; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_time(&ts)); tsk_treeseq_free(&ts); tables.mutations.time[0] = 0; tables.migrations.time[0] = 0.001; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_get_discrete_time(&ts)); tsk_treeseq_free(&ts); tables.migrations.time[0] = 0; tables.mutations.time[0] = TSK_UNKNOWN_TIME; tables.mutations.time[1] = TSK_UNKNOWN_TIME; tables.mutations.time[2] = TSK_UNKNOWN_TIME; tables.mutations.time[3] = TSK_UNKNOWN_TIME; ret = tsk_treeseq_init(&ts, &tables, 
TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_time(&ts)); tsk_treeseq_free(&ts); /* An empty tree sequence has discrete time. */ tsk_table_collection_clear(&tables, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_treeseq_get_discrete_time(&ts)); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } /* Checks tsk_treeseq_get_min_time() on a small example: the expected value of 0.1 must be reported both for the parsed tree sequence and for one rebuilt from a copy of its tables, and must not change when the mutation times are set to TSK_UNKNOWN_TIME. */ static void test_simplest_min_time(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; const char *nodes = "1 0.1 0 -1\n" "1 0.1 0 -1\n" "1 0.1 0 -1\n" "0 1 0 -1\n" "0 2 0 -1\n"; const char *edges = "0 2 3 0,1\n" "0 2 4 2,3\n"; const char *sites = "0 0\n" "1 0\n"; const char *mutations = "0 2 1 -1 0.5\n" "1 3 1 -1 1.5\n"; tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_DOUBLE_EQUAL(tsk_treeseq_get_min_time(&ts), 0.1, 1E-6); ret = tsk_table_collection_copy(ts.tables, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL(tsk_treeseq_get_min_time(&ts), 0.1, 1E-6); tsk_treeseq_free(&ts); /* Setting mutation times to unknown should have no effect on min time. */ tables.mutations.time[0] = TSK_UNKNOWN_TIME; tables.mutations.time[1] = TSK_UNKNOWN_TIME; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL(tsk_treeseq_get_min_time(&ts), 0.1, 1E-6); tsk_treeseq_free(&ts); tables.mutations.time[0] = 0.5; tables.mutations.time[1] = 1.5; /* An empty tree sequence has infinity min time. 
*/ tsk_table_collection_clear(&tables, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_min_time(&ts), INFINITY); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_simplest_max_time(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t tables; const char *nodes = "1 0.1 0 -1\n" "1 0.1 0 -1\n" "1 0.1 0 -1\n" "0 1 0 -1\n" "0 2 0 -1\n"; const char *edges = "0 2 3 0,1\n" "0 2 4 2,3\n"; const char *sites = "0 0\n" "1 0\n"; const char *mutations = "0 2 1 -1 0.5\n" "1 3 1 -1 1.5\n"; tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_DOUBLE_EQUAL(tsk_treeseq_get_max_time(&ts), 2.0, 1E-6); ret = tsk_table_collection_copy(ts.tables, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL(tsk_treeseq_get_max_time(&ts), 2.0, 1E-6); tsk_treeseq_free(&ts); /* Setting mutation times to unknown should have no effect on max time. */ tables.mutations.time[0] = TSK_UNKNOWN_TIME; tables.mutations.time[1] = TSK_UNKNOWN_TIME; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_DOUBLE_EQUAL(tsk_treeseq_get_max_time(&ts), 2.0, 1E-6); tsk_treeseq_free(&ts); tables.mutations.time[0] = 0.5; tables.mutations.time[1] = 1.5; /* An empty tree sequence has negative infinity max time. 
*/ tsk_table_collection_clear(&tables, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_max_time(&ts), -INFINITY); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_simplest_records(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts, simplified; tsk_id_t sample_ids[] = { 0, 1 }; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY | TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); tsk_treeseq_free(&ts); } static void test_simplest_nonbinary_records(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges = "0 1 4 0,1,2,3\n"; tsk_treeseq_t ts, simplified; tsk_tree_t t; tsk_id_t sample_ids[] = { 0, 1, 2, 3 
}; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 5); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL(t.num_children[4], 4); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&t), 1); tsk_tree_free(&t); ret = tsk_treeseq_simplify(&ts, sample_ids, 4, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 4, TSK_SIMPLIFY_KEEP_UNARY, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 4, TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); tsk_treeseq_free(&ts); } static void test_simplest_unary_records(void) { int ret; const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n" "0 1 0\n" "0 2 0"; const char *edges = "0 1 2 0\n" "0 1 3 1\n" "0 1 4 2,3\n"; tsk_treeseq_t ts, simplified, simplified_other; tsk_tree_t t; tsk_id_t sample_ids[] = { 0, 1 }; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 5); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_populations(&ts), 1); ret = tsk_tree_init(&t, &ts, 0); 
CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL(t.num_children[2], 1); CU_ASSERT_EQUAL(t.num_children[4], 2); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&t), 1); tsk_tree_free(&t); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&simplified), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&simplified), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&simplified), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_edges(&simplified), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&simplified), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&simplified), 1); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified_other, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE( tsk_table_collection_equals(simplified.tables, simplified_other.tables, 0)); tsk_treeseq_free(&simplified); tsk_treeseq_free(&simplified_other); tsk_treeseq_free(&ts); } static void test_simplest_unary_with_individuals(void) { int ret; const char *nodes = "1 0 0 -1\n" "1 0 0 0\n" "0 1 0 -1\n" "0 1 0 1\n" "0 2 0 -1\n" "0 3 0 -1\n" "0 3 0 2\n" "0 1 0 -1\n" "0 1 0 3\n" "0 0 0 -1\n" "0 0 0 4\n" "0 1 0 3\n"; const char *edges = "0 2 2 0\n" "0 2 3 1\n" "2 3 7 0\n" "2 3 8 1,9\n" "2 3 11 10\n" "0 2 4 2,3\n" "0 1 5 4\n" "1 2 6 4\n"; const char *individuals = "0 0.5 -1,-1\n" "0 1.5,3.1 -1,-1\n" "0 2.1 0,1\n" "0 3.2 1,2\n" "0 4.2 2,3\n"; const char *nodes_expect = "1 0 0 -1\n" "1 0 0 0\n" "0 1 0 1\n" "0 1 0 3\n" "0 2 0 -1\n" 
"0 3 0 2\n"; const char *edges_expect = "0 2 2 1\n" "2 3 3 1\n" "0 2 4 0,2\n" "1 2 5 4\n"; const char *individuals_expect = "0 0.5 -1,-1\n" "0 1.5,3.1 -1,-1\n" "0 2.1 0,1\n" "0 3.2 1,2\n"; tsk_treeseq_t ts, simplified, expected; tsk_id_t sample_ids[] = { 0, 1 }; tsk_treeseq_from_text(&ts, 3, nodes, edges, NULL, NULL, NULL, individuals, NULL, 0); tsk_treeseq_from_text(&expected, 3, nodes_expect, edges_expect, NULL, NULL, NULL, individuals_expect, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 3.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 12); CU_ASSERT_EQUAL(tsk_treeseq_get_num_individuals(&ts), 5); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_populations(&ts), 1); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS | TSK_SIMPLIFY_FILTER_INDIVIDUALS, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(simplified.tables, expected.tables, 0)); tsk_treeseq_free(&simplified); tsk_treeseq_free(&expected); tsk_treeseq_free(&ts); } static void test_simplest_non_sample_leaf_records(void) { int ret; const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n" "0 0 0\n" "0 0 0"; const char *edges = "0 1 2 0,1,3,4\n"; const char *sites = "0.1 0\n" "0.2 0\n" "0.3 0\n" "0.4 0\n"; const char *mutations = "0 0 1\n" "1 1 1\n" "2 3 1\n" "3 4 1"; tsk_treeseq_t ts, simplified; tsk_id_t sample_ids[] = { 0, 1 }; tsk_vargen_t vargen; tsk_variant_t *var; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 5); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, 
NULL, 0); tsk_vargen_print_state(&vargen, _devnull); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->genotypes[0], 1); CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 1); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_vargen_free(&vargen); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&simplified), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&simplified), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&simplified), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&simplified), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&simplified), 1); tsk_treeseq_free(&ts); tsk_treeseq_free(&simplified); } static void test_simplest_degenerate_multiple_root_records(void) { int ret; const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n" "0 1 0\n"; const char *edges = "0 1 2 0\n" "0 1 3 1\n"; tsk_treeseq_t ts, simplified; tsk_tree_t t; tsk_id_t sample_ids[] = { 0, 1 }; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); 
CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&t), 2); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&t), 2); CU_ASSERT_EQUAL(tsk_tree_get_right_root(&t), 3); CU_ASSERT_EQUAL(t.num_edges, 2); CU_ASSERT_EQUAL(t.right_sib[2], 3); CU_ASSERT_EQUAL(t.right_sib[3], TSK_NULL); CU_ASSERT_EQUAL(t.num_children[2], 1); CU_ASSERT_EQUAL(t.num_children[0], 0); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&simplified), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&simplified), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&simplified), 2); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_simplest_multiple_root_records(void) { int ret; const char *nodes = "1 0 0\n" "1 0 0\n" "1 0 0\n" "1 0 0\n" "0 1 0\n" "0 1 0\n"; const char *edges = "0 1 4 0,1\n" "0 1 5 2,3\n"; tsk_treeseq_t ts, simplified; tsk_id_t sample_ids[] = { 0, 1, 2, 3 }; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 6); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_treeseq_simplify(&ts, sample_ids, 4, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&simplified), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&simplified), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&simplified), 6); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&simplified), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&simplified), 1); tsk_treeseq_free(&simplified); /* Make one tree degenerate */ ret = tsk_treeseq_simplify(&ts, sample_ids, 3, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&simplified), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&simplified), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&simplified), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&simplified), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&simplified), 1); tsk_treeseq_free(&simplified); tsk_treeseq_free(&ts); } static void test_simplest_zero_root_tree(void) { int ret; const char *nodes = "0 0 0\n" "0 0 0\n" "0 0 0\n" "0 0 0\n" "0 1 0\n" "0 1 0\n"; const char *edges = "0 1 4 0,1\n" "0 1 5 2,3\n"; tsk_treeseq_t ts; tsk_tree_t t; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 6); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&t), 0); CU_ASSERT_EQUAL(t.num_edges, 4); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&t), TSK_NULL); CU_ASSERT_EQUAL(tsk_tree_get_right_root(&t), TSK_NULL); CU_ASSERT_EQUAL(t.right_sib[2], 3); CU_ASSERT_EQUAL(t.right_sib[3], TSK_NULL); CU_ASSERT_EQUAL(t.num_children[0], 0); CU_ASSERT_EQUAL(t.num_children[4], 2); tsk_tree_free(&t); tsk_treeseq_free(&ts); } static void test_simplest_multi_root_tree(void) { int ret; const 
char *nodes = "1 0 0\n" "1 0 0\n" "1 0 0\n" "0 1 0\n"; const char *edges = "0 1 3 1,2\n"; tsk_treeseq_t ts; tsk_tree_t t; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_tree_init(&t, &ts, 0); tsk_tree_print_state(&t, _devnull); /* Make sure the initial roots are set correctly */ CU_ASSERT_EQUAL(tsk_tree_get_left_root(&t), 0); CU_ASSERT_EQUAL(t.left_sib[0], TSK_NULL); CU_ASSERT_EQUAL(t.right_sib[0], 1); CU_ASSERT_EQUAL(t.left_sib[1], 0); CU_ASSERT_EQUAL(t.right_sib[1], 2); CU_ASSERT_EQUAL(t.left_sib[2], 1); CU_ASSERT_EQUAL(t.right_sib[2], TSK_NULL); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&t), 2); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&t), 0); CU_ASSERT_EQUAL(t.right_sib[0], 3); CU_ASSERT_EQUAL(t.num_edges, 2); CU_ASSERT_EQUAL(t.num_children[0], 0); CU_ASSERT_EQUAL(t.num_children[3], 2); tsk_tree_print_state(&t, _devnull); CU_ASSERT_EQUAL(tsk_tree_set_root_threshold(&t, 1), TSK_ERR_UNSUPPORTED_OPERATION); ret = tsk_tree_next(&t); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_tree_set_root_threshold(&t, 0), TSK_ERR_BAD_PARAM_VALUE); ret = tsk_tree_set_root_threshold(&t, 2); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_tree_get_root_threshold(&t), 2); ret = tsk_tree_next(&t); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&t), 1); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&t), 3); tsk_tree_free(&t); tsk_treeseq_free(&ts); } static void test_simplest_tree_mrca(void) { int ret; tsk_table_collection_t tables; tsk_treeseq_t ts; tsk_tree_t t; tsk_id_t mrca, ret_id; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret_id = tsk_node_table_add_row( 
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_mrca(&t, 0, 0, &mrca); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(mrca, 0); tsk_tree_free(&t); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_simplest_root_mutations(void) { int ret; const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n"; const char *edges = "0 1 2 0,1\n"; const char *sites = "0.1 0"; const char *mutations = "0 2 1"; tsk_flags_t options = 0; tsk_id_t sample_ids[] = { 0, 1 }; tsk_treeseq_t ts, simplified; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, options, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&simplified), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&simplified), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&simplified), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&simplified), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&simplified), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&simplified), 1); tsk_treeseq_free(&simplified); tsk_treeseq_free(&ts); } static void test_simplest_back_mutations(void) { int ret; const char *nodes = "1 0 0\n" "1 0 0\n" "1 0 0\n" "0 1 0\n" "0 2 0\n"; const char *edges = "0 1 3 0,1\n" "0 1 4 2,3\n"; const char 
*sites = "0.5 0"; const char *mutations = "0 3 1 -1\n" "0 0 0 0"; tsk_treeseq_t ts; tsk_vargen_t vargen; tsk_variant_t *var; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 5); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); ret = tsk_vargen_init(&vargen, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_next(&vargen, &var); CU_ASSERT_EQUAL_FATAL(ret, 1); CU_ASSERT_EQUAL(var->num_alleles, 2); CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); CU_ASSERT_EQUAL(var->genotypes[0], 0); CU_ASSERT_EQUAL(var->genotypes[1], 1); CU_ASSERT_EQUAL(var->genotypes[2], 0); CU_ASSERT_EQUAL(var->site.id, 0); CU_ASSERT_EQUAL(var->site.mutations_length, 2); tsk_vargen_free(&vargen); tsk_treeseq_free(&ts); } static void test_simplest_general_samples(void) { const char *nodes = "1 0 0\n" "0 1 0\n" "1 0 0"; const char *edges = "0 1 1 0,2\n"; const char *sites = "0.5 0\n" "0.75 0\n"; const char *mutations = "0 2 1\n" "1 0 1"; const tsk_id_t samples[2] = { 0, 2 }; const tsk_id_t *s; int ret; tsk_treeseq_t ts, simplified; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); s = tsk_treeseq_get_samples(&ts); CU_ASSERT_FATAL(s != NULL); CU_ASSERT_EQUAL(s[0], 0); CU_ASSERT_EQUAL(s[1], 2); ret = tsk_treeseq_simplify(&ts, samples, 2, 0, &simplified, NULL); 
CU_ASSERT_EQUAL_FATAL(ret, 0); s = tsk_treeseq_get_samples(&simplified); CU_ASSERT_FATAL(s != NULL); CU_ASSERT_EQUAL(s[0], 0); CU_ASSERT_EQUAL(s[1], 1); tsk_treeseq_free(&simplified); tsk_treeseq_free(&ts); } static void test_simplest_holey_tree_sequence(void) { const char *nodes_txt = "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges_txt = "0 1 2 0\n" "2 3 2 0\n" "0 1 2 1\n" "2 3 2 1\n"; const char *sites_txt = "0.5 0\n" "1.5 0\n" "2.5 0\n"; const char *mutations_txt = "0 0 1\n" "1 1 1\n" "2 2 1\n"; int ret; tsk_treeseq_t ts, simplified; tsk_id_t sample_ids[] = { 0, 1 }; tsk_treeseq_from_text( &ts, 3, nodes_txt, edges_txt, NULL, sites_txt, mutations_txt, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 3.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 3); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); tsk_treeseq_free(&ts); } static void test_simplest_holey_tsk_treeseq_mutation_parents(void) { const char *nodes_txt = "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges_txt = "0 1 2 0\n" "2 3 2 0\n" "0 1 2 1\n" "2 3 2 1\n"; const char *sites_txt = "0.5 0\n" "1.5 0\n" "2.5 0\n"; const char *mutations_txt = "0 0 1\n" "0 0 1\n" "1 1 1\n" "1 1 1\n" "2 2 1\n" "2 2 1\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; int ret; tsk_treeseq_from_text( &ts, 3, nodes_txt, edges_txt, NULL, sites_txt, mutations_txt, NULL, NULL, 0); 
CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 3); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.mutations.parent[0], -1); CU_ASSERT_EQUAL(tables.mutations.parent[1], 0); CU_ASSERT_EQUAL(tables.mutations.parent[2], -1); CU_ASSERT_EQUAL(tables.mutations.parent[3], 2); CU_ASSERT_EQUAL(tables.mutations.parent[4], -1); CU_ASSERT_EQUAL(tables.mutations.parent[5], 4); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_simplest_initial_gap_tree_sequence(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges = "2 3 2 0,1\n"; const char *sites = "0.5 0\n" "1.5 0\n" "2.5 0\n"; const char *mutations = "0 0 1\n" "1 1 1\n" "2 2 1"; int ret; tsk_treeseq_t ts, simplified; const tsk_id_t z = TSK_NULL; tsk_id_t parents[] = { z, z, z, 2, 2, z, }; tsk_size_t num_trees = 2; tsk_id_t sample_ids[] = { 0, 1 }; tsk_treeseq_from_text(&ts, 3, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 3.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2); verify_trees(&ts, num_trees, parents); ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); ret = tsk_treeseq_simplify( &ts, sample_ids, 2, TSK_SIMPLIFY_KEEP_UNARY, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); 
tsk_treeseq_free(&ts); } static void test_simplest_initial_gap_zero_roots(void) { const char *nodes = "0 0 0\n" "0 0 0\n" "0 1 0"; const char *edges = "2 3 2 0,1\n"; int ret; tsk_treeseq_t ts; const tsk_id_t z = TSK_NULL; tsk_id_t parents[] = { z, z, z, 2, 2, z, }; uint32_t num_trees = 2; tsk_tree_t tree; tsk_treeseq_from_text(&ts, 3, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 3.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2); verify_trees(&ts, num_trees, parents); ret = tsk_tree_init(&tree, &ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&tree), TSK_NULL); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&tree), 0); ret = tsk_tree_next(&tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&tree), TSK_NULL); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&tree), 0); CU_ASSERT_EQUAL(tree.parent[0], 2); CU_ASSERT_EQUAL(tree.parent[1], 2); CU_ASSERT_EQUAL(tree.num_children[2], 2); tsk_tree_free(&tree); tsk_treeseq_free(&ts); } static void test_simplest_holey_tsk_treeseq_zero_roots(void) { const char *nodes_txt = "0 0 0\n" "0 0 0\n" "0 1 0"; const char *edges_txt = "0 1 2 0\n" "2 3 2 0\n" "0 1 2 1\n" "2 3 2 1\n"; int ret; tsk_treeseq_t ts; const tsk_id_t z = TSK_NULL; tsk_id_t parents[] = { 2, 2, z, z, z, z, 2, 2, z, }; uint32_t num_trees = 3; tsk_tree_t tree; tsk_treeseq_from_text(&ts, 3, nodes_txt, edges_txt, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 3.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 3); verify_trees(&ts, num_trees, parents); ret = tsk_tree_init(&tree, &ts, 0); 
CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&tree), TSK_NULL); CU_ASSERT_EQUAL(tree.parent[0], 2); CU_ASSERT_EQUAL(tree.parent[1], 2); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&tree), 0); CU_ASSERT_EQUAL(tree.num_children[2], 2); ret = tsk_tree_next(&tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&tree), TSK_NULL); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&tree), 0); CU_ASSERT_EQUAL(tree.num_children[2], 0); ret = tsk_tree_next(&tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&tree), TSK_NULL); CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&tree), 0); CU_ASSERT_EQUAL(tree.parent[0], 2); CU_ASSERT_EQUAL(tree.parent[1], 2); CU_ASSERT_EQUAL(tree.num_children[2], 2); tsk_tree_free(&tree); tsk_treeseq_free(&ts); } static void test_simplest_initial_gap_tsk_treeseq_mutation_parents(void) { const char *nodes_txt = "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges_txt = "2 3 2 0,1\n"; const char *sites_txt = "0.5 0\n" "1.5 0\n" "2.5 0\n"; const char *mutations_txt = "0 0 1\n" "0 0 1\n" "1 1 1\n" "1 1 1\n" "2 2 1\n" "2 2 1\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; int ret; tsk_treeseq_from_text( &ts, 3, nodes_txt, edges_txt, NULL, sites_txt, mutations_txt, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.mutations.parent[0], -1); CU_ASSERT_EQUAL(tables.mutations.parent[1], 0); CU_ASSERT_EQUAL(tables.mutations.parent[2], -1); CU_ASSERT_EQUAL(tables.mutations.parent[3], 2); CU_ASSERT_EQUAL(tables.mutations.parent[4], -1); CU_ASSERT_EQUAL(tables.mutations.parent[5], 4); tsk_table_collection_free(&tables); 
tsk_treeseq_free(&ts); } static void test_simplest_final_gap_tree_sequence(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges = "0 2 2 0,1\n"; const char *sites = "0.5 0\n" "1.5 0\n" "2.5 0\n"; const char *mutations = "0 0 1\n" "1 1 1\n" "2 0 1"; tsk_treeseq_t ts; const tsk_id_t z = TSK_NULL; tsk_id_t parents[] = { 2, 2, z, z, z, z, }; uint32_t num_trees = 2; tsk_treeseq_from_text(&ts, 3, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 3.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2); verify_trees(&ts, num_trees, parents); tsk_treeseq_free(&ts); } static void test_simplest_final_gap_tsk_treeseq_mutation_parents(void) { const char *nodes_txt = "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges_txt = "0 2 2 0,1\n"; const char *sites_txt = "0.5 0\n" "1.5 0\n" "2.5 0\n"; const char *mutations_txt = "0 0 1\n" "0 0 1\n" "1 1 1\n" "1 1 1\n" "2 0 1\n" "2 0 1\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; int ret; tsk_treeseq_from_text( &ts, 3, nodes_txt, edges_txt, NULL, sites_txt, mutations_txt, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.mutations.parent[0], -1); CU_ASSERT_EQUAL(tables.mutations.parent[1], 0); CU_ASSERT_EQUAL(tables.mutations.parent[2], -1); CU_ASSERT_EQUAL(tables.mutations.parent[3], 2); CU_ASSERT_EQUAL(tables.mutations.parent[4], -1); CU_ASSERT_EQUAL(tables.mutations.parent[5], 4); tsk_table_collection_free(&tables); 
tsk_treeseq_free(&ts); } static void test_simplest_individuals(void) { const char *individuals = "1 0.25 -1,-1\n" "2 0.5,0.25 -1,-1\n" "3 0.75 0,1\n"; const char *nodes = "1 0 -1 -1\n" "1 0 -1 1\n" "0 0 -1 -1\n" "1 0 -1 0\n" "0 0 -1 1\n" "0 0 -1 2\n"; tsk_table_collection_t tables; tsk_treeseq_t ts; tsk_node_t node; tsk_individual_t individual; tsk_flags_t load_flags = TSK_TS_INIT_BUILD_INDEXES; int ret; tsk_id_t pat_id, mat_id; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; parse_individuals(individuals, &tables.individuals); CU_ASSERT_EQUAL_FATAL(tables.individuals.num_rows, 3); parse_nodes(nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 6); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_get_node(&ts, 0, &node); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(node.individual, TSK_NULL); ret = tsk_treeseq_get_node(&ts, 1, &node); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(node.individual, 1); ret = tsk_treeseq_get_individual(&ts, 0, &individual); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(individual.id, 0); CU_ASSERT_EQUAL_FATAL(individual.flags, 1); CU_ASSERT_EQUAL_FATAL(individual.location_length, 1); CU_ASSERT_EQUAL_FATAL(individual.location[0], 0.25); CU_ASSERT_EQUAL_FATAL(individual.parents_length, 2); CU_ASSERT_EQUAL_FATAL(individual.parents[0], -1); CU_ASSERT_EQUAL_FATAL(individual.parents[1], -1); pat_id = individual.id; CU_ASSERT_EQUAL_FATAL(individual.nodes_length, 1); CU_ASSERT_EQUAL_FATAL(individual.nodes[0], 3); ret = tsk_treeseq_get_individual(&ts, 1, &individual); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(individual.id, 1); CU_ASSERT_EQUAL_FATAL(individual.flags, 2); CU_ASSERT_EQUAL_FATAL(individual.location_length, 2); CU_ASSERT_EQUAL_FATAL(individual.location[0], 0.5); CU_ASSERT_EQUAL_FATAL(individual.location[1], 0.25); CU_ASSERT_EQUAL_FATAL(individual.parents_length, 
2); CU_ASSERT_EQUAL_FATAL(individual.parents[0], -1); CU_ASSERT_EQUAL_FATAL(individual.parents[1], -1); mat_id = individual.id; CU_ASSERT_EQUAL_FATAL(individual.nodes_length, 2); CU_ASSERT_EQUAL_FATAL(individual.nodes[0], 1); CU_ASSERT_EQUAL_FATAL(individual.nodes[1], 4); ret = tsk_treeseq_get_individual(&ts, 2, &individual); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(individual.id, 2); CU_ASSERT_EQUAL_FATAL(individual.flags, 3); CU_ASSERT_EQUAL_FATAL(individual.location_length, 1); CU_ASSERT_EQUAL_FATAL(individual.location[0], 0.75); CU_ASSERT_EQUAL_FATAL(individual.parents_length, 2); CU_ASSERT_EQUAL_FATAL(individual.parents[0], pat_id); CU_ASSERT_EQUAL_FATAL(individual.parents[1], mat_id); CU_ASSERT_EQUAL_FATAL(individual.nodes_length, 1); CU_ASSERT_EQUAL_FATAL(individual.nodes[0], 5); ret = tsk_treeseq_get_individual(&ts, 3, &individual); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); /* NaN/ifinity values are allowed in locations they do not * affect the integrity of the model. 
*/ tables.individuals.location[0] = NAN; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, 0); ret = tsk_treeseq_get_individual(&ts, 0, &individual); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT(!tsk_isfinite(individual.location[0])); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_simplest_bad_individuals(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n" "1 0 0\n" "0 1 0\n"; const char *edges = "0 1 2 0\n" "0 1 2 1\n" "0 1 4 3\n"; const char *individuals = "1 0.25 -1\n" "2 0.5,0.25 0\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_flags_t load_flags = TSK_TS_INIT_BUILD_INDEXES; tsk_id_t ret_id; int ret; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; parse_nodes(nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 5); parse_edges(edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 3); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /* Make sure we have a good set of records */ ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); /* Bad individual ID */ tables.nodes.individual[0] = -2; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.nodes.individual[0] = TSK_NULL; /* Bad individual ID */ tables.nodes.individual[0] = 0; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.nodes.individual[0] = TSK_NULL; /* Add two individuals */ parse_individuals(individuals, &tables.individuals); CU_ASSERT_EQUAL_FATAL(tables.individuals.num_rows, 2); /* Make sure we have a good set of records */ ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); /* Bad individual ID */ tables.nodes.individual[0] = 2; ret = 
tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.nodes.individual[0] = TSK_NULL; /* Bad parent ID */ tables.individuals.parents[0] = -2; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.individuals.parents[0] = 42; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.individuals.parents[0] = TSK_NULL; /* Parent is self */ tables.individuals.parents[0] = 0; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_SELF_PARENT); tsk_treeseq_free(&ts); tables.individuals.parents[0] = TSK_NULL; /* Unsorted individuals are OK*/ tables.individuals.parents[0] = 1; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, 0); tsk_treeseq_free(&ts); tables.individuals.parents[0] = TSK_NULL; tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_simplest_bad_edges(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n" "1 0 0\n" "0 1 0\n"; const char *edges = "0 1 2 0\n" "0 1 2 1\n" "0 1 4 3\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; int ret; tsk_id_t ret_id; tsk_flags_t load_flags = TSK_TS_INIT_BUILD_INDEXES; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; parse_nodes(nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 5); parse_edges(edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 3); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /* Make sure we have a good set of records */ ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); /* Bad population ID */ tables.nodes.population[0] = -2; ret = tsk_treeseq_init(&ts, &tables, load_flags); 
CU_ASSERT_EQUAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.nodes.population[0] = 0; /* Bad population ID */ tables.nodes.population[0] = 1; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.nodes.population[0] = 0; /* Bad interval */ tables.edges.right[0] = 0.0; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_EDGE_INTERVAL); tsk_treeseq_free(&ts); tables.edges.right[0] = 1.0; /* Nonfinite coords */ tables.edges.left[0] = NAN; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE); tsk_treeseq_free(&ts); tables.edges.left[0] = 1.0; tables.edges.left[0] = INFINITY; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE); tsk_treeseq_free(&ts); tables.edges.left[0] = 1.0; tables.edges.right[0] = NAN; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE); tsk_treeseq_free(&ts); tables.edges.right[0] = 1.0; tables.edges.right[0] = -INFINITY; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE); tsk_treeseq_free(&ts); tables.edges.right[0] = 1.0; /* Left coordinate < 0. */ tables.edges.left[0] = -1; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_LEFT_LESS_ZERO); tsk_treeseq_free(&ts); tables.edges.left[0] = 0.0; /* Right coordinate > sequence length. 
*/ tables.edges.right[0] = 2.0; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_RIGHT_GREATER_SEQ_LENGTH); tsk_treeseq_free(&ts); tables.edges.right[0] = 1.0; /* Duplicate records */ tables.edges.child[0] = 1; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_DUPLICATE_EDGES); tsk_treeseq_free(&ts); tables.edges.child[0] = 0; /* Duplicate records */ tables.edges.child[0] = 1; tables.edges.left[0] = 0.5; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_EDGES_NOT_SORTED_LEFT); tsk_treeseq_free(&ts); tables.edges.child[0] = 0; tables.edges.left[0] = 0.0; /* child node == parent */ tables.edges.child[1] = 2; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_NODE_TIME_ORDERING); tsk_treeseq_free(&ts); tables.edges.child[1] = 1; /* Unsorted child nodes */ tables.edges.child[0] = 1; tables.edges.child[1] = 0; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_EDGES_NOT_SORTED_CHILD); tsk_treeseq_free(&ts); tables.edges.child[0] = 0; tables.edges.child[1] = 1; /* discontinuous parent nodes */ /* Swap rows 1 and 2 */ tables.edges.parent[1] = 4; tables.edges.child[1] = 3; tables.edges.parent[2] = 2; tables.edges.child[2] = 1; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS); tsk_treeseq_free(&ts); tables.edges.parent[2] = 4; tables.edges.child[2] = 3; tables.edges.parent[1] = 2; tables.edges.child[1] = 1; /* Null parent */ tables.edges.parent[0] = TSK_NULL; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_NULL_PARENT); tsk_treeseq_free(&ts); tables.edges.parent[0] = 2; /* parent not in nodes list */ tables.nodes.num_rows = 2; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.nodes.num_rows = 5; /* parent negative */ tables.edges.parent[0] = -2; ret = 
tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.edges.parent[0] = 2; /* Null child */ tables.edges.child[0] = TSK_NULL; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_NULL_CHILD); tsk_treeseq_free(&ts); tables.edges.child[0] = 0; /* child node reference out of bounds */ tables.edges.child[0] = 100; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.edges.child[0] = 0; /* child node reference negative */ tables.edges.child[0] = -2; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tables.edges.child[0] = 0; /* Make sure we've preserved a good tree sequence */ ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, 0); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_simplest_bad_indexes(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n" "1 0 0\n" "0 1 0\n"; const char *edges = "0 1 2 0\n" "0 1 2 1\n" "0 1 4 3\n"; tsk_table_collection_t tables; tsk_id_t bad_indexes[] = { -1, 3, 4, 1000 }; tsk_size_t j; tsk_id_t ret_id; tsk_id_t ret_num_trees; int ret; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; parse_nodes(nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 5); parse_edges(edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 3); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /* Make sure we have a good set of records */ ret = (int) tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLES_NOT_INDEXED); ret = tsk_table_collection_build_index(&tables, 0); 
CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret_num_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
    /* TSK_CHECK_TREES returns the number of trees */
    CU_ASSERT_EQUAL_FATAL(ret_num_trees, 1);

    /* Corrupt the first entry of each index in turn with every bad value,
     * resetting the entry to 0 after each check. */
    for (j = 0; j < sizeof(bad_indexes) / sizeof(*bad_indexes); j++) {
        tables.indexes.edge_insertion_order[0] = bad_indexes[j];
        ret_num_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
        CU_ASSERT_EQUAL_FATAL(ret_num_trees, TSK_ERR_EDGE_OUT_OF_BOUNDS);
        tables.indexes.edge_insertion_order[0] = 0;

        tables.indexes.edge_removal_order[0] = bad_indexes[j];
        ret_num_trees = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES);
        CU_ASSERT_EQUAL_FATAL(ret_num_trees, TSK_ERR_EDGE_OUT_OF_BOUNDS);
        tables.indexes.edge_removal_order[0] = 0;
    }

    /* After dropping the index the tables must be reported as unindexed */
    ret = tsk_table_collection_drop_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = (int) tsk_table_collection_check_integrity(&tables, TSK_CHECK_INDEXES);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLES_NOT_INDEXED);

    tsk_table_collection_free(&tables);
}

/*
 * Checks that tsk_table_collection_check_integrity (with no extra flags)
 * reports the expected error code when a single field of the only migration
 * row is corrupted: node, source, dest, time, left or right. Each field is
 * restored to its original value after the check.
 */
static void
test_simplest_bad_migrations(void)
{
    tsk_table_collection_t tables;
    int ret;
    tsk_id_t ret_id;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1;

    /* insert two populations and one node to refer to. */
    ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 1);
    /* One migration, node 0 goes from population 0 to 1.
     */
    ret_id
        = tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 1, 1.0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);

    /* We only need basic integrity checks for migrations */
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Bad node reference: negative */
    tables.migrations.node[0] = -1;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    tables.migrations.node[0] = 0;

    /* Bad node reference: only node 0 exists */
    tables.migrations.node[0] = 1;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    tables.migrations.node[0] = 0;

    /* Bad population reference: negative source */
    tables.migrations.source[0] = -1;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS);
    tables.migrations.source[0] = 0;

    /* Bad population reference: source past the two populations */
    tables.migrations.source[0] = 2;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS);
    tables.migrations.source[0] = 0;

    /* Bad population reference: negative dest */
    tables.migrations.dest[0] = -1;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS);
    tables.migrations.dest[0] = 1;

    /* Bad population reference: dest past the two populations */
    tables.migrations.dest[0] = 2;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS);
    tables.migrations.dest[0] = 1;

    /* Bad time values */
    tables.migrations.time[0] = NAN;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_TIME_NONFINITE);
    tables.migrations.time[0] = 1.0;

    tables.migrations.time[0] = INFINITY;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_TIME_NONFINITE);
    tables.migrations.time[0] = 1.0;

    /* Bad left coordinate */
    tables.migrations.left[0] = -1;
    ret = (int)
tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_LEFT_LESS_ZERO);
    tables.migrations.left[0] = 0;

    tables.migrations.left[0] = NAN;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE);
    tables.migrations.left[0] = 0;

    tables.migrations.left[0] = -INFINITY;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE);
    tables.migrations.left[0] = 0;

    /* Bad right coordinate: beyond sequence_length, then non-finite */
    tables.migrations.right[0] = 2;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_RIGHT_GREATER_SEQ_LENGTH);
    tables.migrations.right[0] = 1;

    tables.migrations.right[0] = NAN;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE);
    tables.migrations.right[0] = 1;

    tables.migrations.right[0] = INFINITY;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_GENOME_COORDS_NONFINITE);
    tables.migrations.right[0] = 1;

    /* Bad interval coordinate: right == left. The migration check reports
     * this with the edge-interval error code. */
    tables.migrations.right[0] = 0;
    ret = (int) tsk_table_collection_check_integrity(&tables, 0);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_EDGE_INTERVAL);
    tables.migrations.right[0] = 1;

    tsk_table_collection_free(&tables);
}

/*
 * Checks that simplifying a table collection that contains a migration row
 * fails with TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED.
 */
static void
test_simplest_migration_simplify(void)
{
    tsk_table_collection_t tables;
    int ret;
    tsk_id_t ret_id;
    tsk_id_t samples[] = { 0, 1 };

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1;

    /* insert two populations and two sample nodes to refer to.
     */
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    ret_id = tsk_node_table_add_row(
        &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 1);
    ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);
    ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 1);
    /* One migration, node 0 goes from population 0 to 1. */
    ret_id
        = tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 1, 1.0, NULL, 0);
    CU_ASSERT_EQUAL_FATAL(ret_id, 0);

    ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED);

    tsk_table_collection_free(&tables);
}

/*
 * Builds a single tree in which samples 0 and 1 are both children of node 2
 * over [0, 1) and checks the parent / sibling / child arrays of the first
 * tree.
 */
static void
test_simplest_overlapping_parents(void)
{
    const char *nodes = "1 0 -1\n"
                        "1 0 -1\n"
                        "0 1 -1\n";
    const char *edges = "0 1 2 0\n"
                        "0 1 2 1\n";
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;
    tsk_tree_t tree;
    int ret;
    tsk_flags_t load_flags = TSK_TS_INIT_BUILD_INDEXES;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1;
    parse_nodes(nodes, &tables.nodes);
    CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 3);
    parse_edges(edges, &tables.edges);
    CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 2);

    /* These assignments match the values already given in the edge text
     * above for row 0 (left 0, parent 2). */
    tables.edges.left[0] = 0;
    tables.edges.parent[0] = 2;

    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_init(&tree, &ts, 0);
    CU_ASSERT_EQUAL(ret, 0);
    ret = tsk_tree_first(&tree);
    CU_ASSERT_EQUAL(ret, TSK_TREE_OK);
    CU_ASSERT_EQUAL(tree.parent[0], 2);
    CU_ASSERT_EQUAL(tree.parent[1], 2);
    CU_ASSERT_EQUAL(tree.left_sib[2], TSK_NULL);
    CU_ASSERT_EQUAL(tree.right_sib[2], TSK_NULL);
    CU_ASSERT_EQUAL(tree.left_child[2], 0);
    CU_ASSERT_EQUAL(tree.right_child[2], 1);
    CU_ASSERT_EQUAL(tree.left_sib[0], TSK_NULL);
    CU_ASSERT_EQUAL(tree.right_sib[0], 1);
CU_ASSERT_EQUAL(tree.left_sib[1], 0);
    CU_ASSERT_EQUAL(tree.right_sib[1], TSK_NULL);
    CU_ASSERT_EQUAL(tree.num_children[2], 2);

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
    tsk_table_collection_free(&tables);
}

/*
 * Checks that tsk_treeseq_init fails with
 * TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN when node 0 is listed as a child
 * of two different parents (1 and 2) over the same interval.
 */
static void
test_simplest_contradictory_children(void)
{
    const char *nodes = "1 0 -1\n"
                        "1 1 -1\n"
                        "0 1 -1\n";
    const char *edges = "0 1 1 0\n"
                        "0 1 2 0\n";
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;
    int ret;
    tsk_flags_t load_flags = TSK_TS_INIT_BUILD_INDEXES;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    parse_nodes(nodes, &tables.nodes);
    CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 3);
    parse_edges(edges, &tables.edges);
    CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 2);
    tables.sequence_length = 1.0;

    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN);

    tsk_treeseq_free(&ts);
    tsk_table_collection_free(&tables);
}

/*
 * Simplifies three samples whose edges to the same parent (node 3) cover
 * different, overlapping intervals, and checks that the node and edge tables
 * come back unchanged.
 */
static void
test_simplest_overlapping_edges_simplify(void)
{
    const char *nodes = "1 0 -1\n"
                        "1 0 -1\n"
                        "1 0 -1\n"
                        "0 1 -1";
    const char *edges = "0 2 3 0\n"
                        "1 3 3 1\n"
                        "0 3 3 2\n";
    tsk_id_t samples[] = { 0, 1, 2 };
    tsk_table_collection_t tables;
    int ret;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 3;
    parse_nodes(nodes, &tables.nodes);
    CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 4);
    parse_edges(edges, &tables.edges);
    CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 3);

    ret = tsk_table_collection_simplify(&tables, samples, 3, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    CU_ASSERT_EQUAL(tables.nodes.num_rows, 4);
    CU_ASSERT_EQUAL(tables.edges.num_rows, 3);

    /* Identical to the input.
    0 2 3 0
    1 3 3 1
    0 3 3 2
    */
    CU_ASSERT_EQUAL(tables.edges.left[0], 0);
    CU_ASSERT_EQUAL(tables.edges.left[1], 1);
    CU_ASSERT_EQUAL(tables.edges.left[2], 0);
    CU_ASSERT_EQUAL(tables.edges.right[0], 2);
    CU_ASSERT_EQUAL(tables.edges.right[1], 3);
    CU_ASSERT_EQUAL(tables.edges.right[2], 3);
    CU_ASSERT_EQUAL(tables.edges.parent[0], 3);
    CU_ASSERT_EQUAL(tables.edges.parent[1], 3);
    CU_ASSERT_EQUAL(tables.edges.parent[2], 3);
    CU_ASSERT_EQUAL(tables.edges.child[0], 0);
    CU_ASSERT_EQUAL(tables.edges.child[1], 1);
    CU_ASSERT_EQUAL(tables.edges.child[2], 2);

    tsk_table_collection_free(&tables);
}

/*
 * Simplifies two samples whose edges to a non-sample parent (node 2) overlap
 * only on [1, 2), and checks that both output edges are trimmed to that
 * shared interval.
 */
static void
test_simplest_overlapping_unary_edges_simplify(void)
{
    const char *nodes = "1 0 -1\n"
                        "1 0 -1\n"
                        "0 1 -1";
    const char *edges = "0 2 2 0\n"
                        "1 3 2 1\n";
    tsk_id_t samples[] = { 0, 1 };
    tsk_table_collection_t tables;
    int ret;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 3;
    parse_nodes(nodes, &tables.nodes);
    CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 3);
    parse_edges(edges, &tables.edges);
    CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 2);

    ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    CU_ASSERT_EQUAL(tables.nodes.num_rows, 3);
    CU_ASSERT_EQUAL(tables.edges.num_rows, 2);

    /* Because we only sample 0 and 1, the flanking unary edges are removed
     1 2 2 0
     1 2 2 1
     */
    CU_ASSERT_EQUAL(tables.edges.left[0], 1);
    CU_ASSERT_EQUAL(tables.edges.right[0], 2);
    CU_ASSERT_EQUAL(tables.edges.parent[0], 2);
    CU_ASSERT_EQUAL(tables.edges.child[0], 0);
    CU_ASSERT_EQUAL(tables.edges.left[1], 1);
    CU_ASSERT_EQUAL(tables.edges.right[1], 2);
    CU_ASSERT_EQUAL(tables.edges.parent[1], 2);
    CU_ASSERT_EQUAL(tables.edges.child[1], 1);

    tsk_table_collection_free(&tables);
}

/*
 * Same topology as test_simplest_overlapping_unary_edges_simplify, but the
 * parent (node 2) is flagged as a sample and included in the sample list.
 */
static void
test_simplest_overlapping_unary_edges_internal_samples_simplify(void)
{
    const char *nodes = "1 0 -1\n"
                        "1 0 -1\n"
                        "1 1 -1";
    const char *edges = "0 2 2 0\n"
                        "1 3 2 1\n";
    tsk_id_t samples[] = { 0, 1, 2 };
    tsk_table_collection_t tables;
    int ret;
ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 3; parse_nodes(nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 3); parse_edges(edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 2); ret = tsk_table_collection_simplify(&tables, samples, 3, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.nodes.num_rows, 3); CU_ASSERT_EQUAL(tables.edges.num_rows, 2); /* Identical to the input. 0 2 2 0 1 3 2 1 */ CU_ASSERT_EQUAL(tables.edges.left[0], 0); CU_ASSERT_EQUAL(tables.edges.left[1], 1); CU_ASSERT_EQUAL(tables.edges.right[0], 2); CU_ASSERT_EQUAL(tables.edges.right[1], 3); CU_ASSERT_EQUAL(tables.edges.parent[0], 2); CU_ASSERT_EQUAL(tables.edges.parent[1], 2); CU_ASSERT_EQUAL(tables.edges.child[0], 0); CU_ASSERT_EQUAL(tables.edges.child[1], 1); tsk_table_collection_free(&tables); } static void test_simplest_reduce_site_topology(void) { /* Two trees side by side, with a site on the second one. The first * tree should disappear. 
*/ const char *nodes = "1 0 -1\n" "1 0 -1\n" "0 1 -1\n" "0 2 -1\n"; const char *edges = "0 1 2 0\n" "0 1 2 1\n" "1 2 3 0\n" "1 2 3 1\n"; const char *sites = "1.0 0\n"; tsk_id_t samples[] = { 0, 1 }; tsk_table_collection_t tables; int ret; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 2; parse_nodes(nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 4); parse_edges(edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 4); parse_sites(sites, &tables.sites); CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 1); ret = tsk_table_collection_simplify( &tables, samples, 2, TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.nodes.num_rows, 3); CU_ASSERT_EQUAL(tables.edges.num_rows, 2); CU_ASSERT_EQUAL(tables.edges.left[0], 0); CU_ASSERT_EQUAL(tables.edges.left[1], 0); CU_ASSERT_EQUAL(tables.edges.right[0], 2); CU_ASSERT_EQUAL(tables.edges.right[1], 2); CU_ASSERT_EQUAL(tables.edges.parent[0], 2); CU_ASSERT_EQUAL(tables.edges.parent[1], 2); CU_ASSERT_EQUAL(tables.edges.child[0], 0); CU_ASSERT_EQUAL(tables.edges.child[1], 1); tsk_table_collection_free(&tables); } static void test_simplest_simplify_defragment(void) { const char *nodes = "0 2 -1\n" "0 2 -1\n" "0 2 -1\n" "0 2 -1\n" "0 2 -1\n" "0 2 -1\n" "0 1 -1\n" "0 1 -1\n" "0 1 -1\n" "0 1 -1\n" "0 1 -1\n" "0 1 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n" "1 0 -1\n"; const char *edges = "0.00000000 0.20784841 8 12\n" "0.00000000 0.42202433 8 15\n" "0.00000000 0.63541014 8 16\n" "0.42202433 1.00000000 9 15\n" "0.00000000 1.00000000 9 17\n" "0.00000000 1.00000000 10 14\n" "0.20784841 1.00000000 11 12\n" "0.00000000 1.00000000 11 13\n" "0.63541014 1.00000000 11 16\n" "0.00000000 1.00000000 0 10\n" "0.62102072 1.00000000 1 9\n" "0.00000000 1.00000000 1 11\n" "0.00000000 0.26002984 2 6\n" "0.26002984 1.00000000 2 6\n" "0.00000000 0.62102072 2 9\n" "0.55150554 1.00000000 3 8\n" "0.00000000 
1.00000000 4 7\n" "0.00000000 0.55150554 5 8\n"; tsk_id_t samples[] = { 12, 13, 14, 15, 16, 17 }; tsk_table_collection_t tables; int ret; /* This was the simplest example I could find that exercised the * inner loops of the simplifier_extract_ancestry function */ ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; parse_nodes(nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 18); parse_edges(edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 18); ret = tsk_table_collection_simplify(&tables, samples, 6, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.nodes.num_rows, 10); CU_ASSERT_EQUAL(tables.edges.num_rows, 10); tsk_table_collection_free(&tables); } static void test_simplest_population_filter(void) { tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1 }; int ret; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; tsk_population_table_add_row(&tables.populations, "0", 1); tsk_population_table_add_row(&tables.populations, "1", 1); tsk_population_table_add_row(&tables.populations, "2", 1); /* Two nodes referring to population 1 */ tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 1, TSK_NULL, NULL, 0); tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 1, TSK_NULL, NULL, 0); ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.nodes.num_rows, 2); CU_ASSERT_EQUAL(tables.populations.num_rows, 3); CU_ASSERT_EQUAL(tables.populations.metadata[0], '0'); CU_ASSERT_EQUAL(tables.populations.metadata[1], '1'); CU_ASSERT_EQUAL(tables.populations.metadata[2], '2'); ret = tsk_table_collection_simplify( &tables, samples, 2, TSK_SIMPLIFY_FILTER_POPULATIONS, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.nodes.num_rows, 2); CU_ASSERT_EQUAL(tables.nodes.population[0], 0); CU_ASSERT_EQUAL(tables.nodes.population[1], 0); 
CU_ASSERT_EQUAL(tables.populations.num_rows, 1); CU_ASSERT_EQUAL(tables.populations.metadata[0], '1'); tsk_table_collection_free(&tables); } static void test_simplest_individual_filter(void) { tsk_table_collection_t tables; tsk_id_t samples[] = { 0, 1 }; int ret; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0, "0", 1); tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0, "1", 1); tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0, "2", 1); /* Two nodes referring to individual 1 */ tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0); tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0); ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.nodes.num_rows, 2); CU_ASSERT_EQUAL(tables.individuals.num_rows, 3); CU_ASSERT_EQUAL(tables.individuals.metadata[0], '0'); CU_ASSERT_EQUAL(tables.individuals.metadata[1], '1'); CU_ASSERT_EQUAL(tables.individuals.metadata[2], '2'); ret = tsk_table_collection_simplify( &tables, samples, 2, TSK_SIMPLIFY_FILTER_INDIVIDUALS, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables.nodes.num_rows, 2); CU_ASSERT_EQUAL(tables.nodes.individual[0], 0); CU_ASSERT_EQUAL(tables.nodes.individual[1], 0); CU_ASSERT_EQUAL(tables.individuals.num_rows, 1); CU_ASSERT_EQUAL(tables.individuals.metadata[0], '1'); tsk_table_collection_free(&tables); } static void test_simplest_no_node_filter(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "0 1 0\n" "0 1 0"; /* unreferenced node */ const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts, simplified; tsk_id_t sample_ids[] = { 0, 1 }; tsk_id_t node_map[] = { -1, -1, -1, -1 }; tsk_id_t j; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_simplify( &ts, NULL, 0, 
TSK_SIMPLIFY_NO_FILTER_NODES, &simplified, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
    tsk_treeseq_free(&simplified);

    /* Passing both samples explicitly also leaves the tables unchanged */
    ret = tsk_treeseq_simplify(
        &ts, sample_ids, 2, TSK_SIMPLIFY_NO_FILTER_NODES, &simplified, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
    tsk_treeseq_free(&simplified);

    /* Reversing sample order makes no difference */
    sample_ids[0] = 1;
    sample_ids[1] = 0;
    ret = tsk_treeseq_simplify(
        &ts, sample_ids, 2, TSK_SIMPLIFY_NO_FILTER_NODES, &simplified, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
    tsk_treeseq_free(&simplified);

    /* Only the first entry of sample_ids (node 1) is passed: all four nodes
     * are expected to remain, no edges, and node_map to be the identity. */
    ret = tsk_treeseq_simplify(
        &ts, sample_ids, 1, TSK_SIMPLIFY_NO_FILTER_NODES, &simplified, node_map);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&simplified), 4);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_edges(&simplified), 0);
    for (j = 0; j < 4; j++) {
        CU_ASSERT_EQUAL(node_map[j], j);
    }
    tsk_treeseq_free(&simplified);

    /* Same single sample, but keeping input roots and unary nodes: one
     * edge is expected to remain. */
    ret = tsk_treeseq_simplify(&ts, sample_ids, 1,
        TSK_SIMPLIFY_NO_FILTER_NODES | TSK_SIMPLIFY_KEEP_INPUT_ROOTS
            | TSK_SIMPLIFY_KEEP_UNARY,
        &simplified, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&simplified), 4);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_edges(&simplified), 1);
    tsk_treeseq_free(&simplified);

    /* The same sample listed twice must be rejected */
    sample_ids[0] = 0;
    sample_ids[1] = 0;
    ret = tsk_treeseq_simplify(
        &ts, sample_ids, 2, TSK_SIMPLIFY_NO_FILTER_NODES, &simplified, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE);
    /* NOTE(review): simplified was already freed above and the failing call
     * may not have re-initialised it - confirm this free is safe. */
    tsk_treeseq_free(&simplified);

    tsk_treeseq_free(&ts);
}

/*
 * Checks that simplifying with TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS (alone
 * and combined with TSK_SIMPLIFY_NO_FILTER_NODES) yields tables equal to the
 * input even though node 0 is passed as a sample without carrying the sample
 * flag.
 */
static void
test_simplest_no_update_flags(void)
{
    const char *nodes = "0 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n";
    const char *edges = "0 1 2 0,1\n";
    tsk_treeseq_t ts, simplified;
    tsk_id_t sample_ids[] = { 0, 1 };
    int ret;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

    /* We have a mixture of sample and non-samples in the input tables */
    ret = tsk_treeseq_simplify(
        &ts, sample_ids, 2, TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS, &simplified, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
    tsk_treeseq_free(&simplified);

    ret = tsk_treeseq_simplify(&ts, sample_ids, 2,
        TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS | TSK_SIMPLIFY_NO_FILTER_NODES, &simplified,
        NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
    tsk_treeseq_free(&simplified);

    tsk_treeseq_free(&ts);
}

/*
 * Exercises tsk_tree_map_mutations on a two-sample tree (samples 0 and 1
 * under node 2): identical genotypes, one differing genotype, a negative
 * genotype value, the tree state after iteration has finished, and a
 * caller-fixed ancestral state. The transitions array returned by each call
 * is released with free().
 */
static void
test_simplest_map_mutations(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0";
    const char *edges = "0 1 2 0,1\n";
    tsk_treeseq_t ts;
    tsk_tree_t t;
    int32_t genotypes[] = { 0, 0 };
    tsk_size_t num_transitions;
    tsk_state_transition_t *transitions;
    int32_t ancestral_state;
    int ret;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_tree_next(&t));

    /* All genotypes equal: no transitions expected */
    ret = tsk_tree_map_mutations(
        &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 0);
    free(transitions);

    /* Sample 0 differs: a single transition on node 0 is expected */
    genotypes[0] = 1;
    ret = tsk_tree_map_mutations(
        &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 1);
    CU_ASSERT_EQUAL_FATAL(transitions[0].node, 0);
    CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1);
    free(transitions);

    /* A genotype of -1 for sample 0 is expected to yield no transitions */
    genotypes[0] = -1;
    ret = tsk_tree_map_mutations(
        &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 0);
    free(transitions);

    /* Check the null tree */
genotypes[0] = 1;
    /* The tree sequence has a single tree, so advancing again returns false */
    CU_ASSERT_FALSE(tsk_tree_next(&t));
    ret = tsk_tree_map_mutations(
        &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 1);
    CU_ASSERT_EQUAL_FATAL(transitions[0].node, 0);
    CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1);
    free(transitions);

    /* Assign the ancestral_state: with TSK_MM_FIXED_ANCESTRAL_STATE the
     * value placed in ancestral_state before the call is used, so both
     * samples (state 1) need their own transition. */
    genotypes[0] = 1;
    genotypes[1] = 1;
    ancestral_state = 0;
    ret = tsk_tree_map_mutations(&t, genotypes, NULL, TSK_MM_FIXED_ANCESTRAL_STATE,
        &ancestral_state, &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 2);
    CU_ASSERT_EQUAL_FATAL(transitions[0].node, 1);
    CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1);
    CU_ASSERT_EQUAL_FATAL(transitions[1].node, 0);
    CU_ASSERT_EQUAL_FATAL(transitions[1].parent, TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(transitions[1].state, 1);
    free(transitions);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/*
 * Exercises tsk_tree_map_mutations on a tree where four samples share a
 * single parent (node 4).
 */
static void
test_simplest_nonbinary_map_mutations(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0";
    const char *edges = "0 1 4 0,1,2,3\n";
    tsk_treeseq_t ts;
    tsk_tree_t t;
    int32_t genotypes[] = { 0, 0, 0, 0 };
    tsk_size_t num_transitions;
    tsk_state_transition_t *transitions;
    int32_t ancestral_state;
    int ret;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_tree_next(&t));

    /* All genotypes equal: no transitions expected */
    ret = tsk_tree_map_mutations(
        &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 0);
    free(transitions);

    /* Sample 0 differs: a single transition on node 0 is expected */
    genotypes[0] = 1;
    ret = tsk_tree_map_mutations(
        &t, genotypes, NULL, 0, &ancestral_state,
        &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 1);
    CU_ASSERT_EQUAL_FATAL(transitions[0].node, 0);
    CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1);
    free(transitions);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/*
 * Exercises tsk_tree_map_mutations on a tree where each sample hangs below
 * its own unary node (0 under 2, 1 under 3, both under root 4).
 */
static void
test_simplest_unary_map_mutations(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n"
                        "0 1 0\n"
                        "0 2 0";
    const char *edges = "0 1 2 0\n"
                        "0 1 3 1\n"
                        "0 1 4 2,3\n";
    tsk_treeseq_t ts;
    tsk_tree_t t;
    int32_t genotypes[] = { 0, 0 };
    tsk_size_t num_transitions;
    tsk_state_transition_t *transitions;
    int32_t ancestral_state;
    int ret;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_tree_next(&t));

    /* All genotypes equal: no transitions expected */
    ret = tsk_tree_map_mutations(
        &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 0);
    free(transitions);

    /* Sample 0 differs: the transition is expected on node 2, the unary
     * node above sample 0, rather than on the sample itself. */
    genotypes[0] = 1;
    ret = tsk_tree_map_mutations(
        &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(ancestral_state, 0);
    CU_ASSERT_EQUAL_FATAL(num_transitions, 1);
    CU_ASSERT_EQUAL_FATAL(transitions[0].node, 2);
    CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1);
    free(transitions);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/*
 * Exercises tsk_tree_map_mutations on a tree whose root (node 2) has two
 * sample children (0, 1) and two non-sample leaf children (3, 4).
 */
static void
test_simplest_non_sample_leaf_map_mutations(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n"
                        "0 0 0\n"
                        "0 0 0";
    const char *edges = "0 1 2 0,1,3,4\n";
    tsk_treeseq_t ts;
    tsk_tree_t t;
    int32_t genotypes[] = { 0, 0 };
    tsk_size_t num_transitions;
    tsk_state_transition_t *transitions;
    int32_t ancestral_state;
    int ret;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

    ret =
tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_tree_next(&t)); ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 0); free(transitions); genotypes[0] = 1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 1); CU_ASSERT_EQUAL_FATAL(transitions[0].node, 0); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1); free(transitions); tsk_tree_free(&t); tsk_treeseq_free(&ts); } static void test_simplest_internal_sample_map_mutations(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "1 1 0"; const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts; tsk_tree_t t; int32_t genotypes[] = { 0, 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_tree_next(&t)); ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 0); free(transitions); genotypes[0] = 1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 1); CU_ASSERT_EQUAL_FATAL(transitions[0].node, 0); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1); free(transitions); genotypes[2] = 1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 
0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 1); CU_ASSERT_EQUAL_FATAL(num_transitions, 1); CU_ASSERT_EQUAL_FATAL(transitions[0].node, 1); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, 0); free(transitions); tsk_tree_free(&t); tsk_treeseq_free(&ts); } static void test_simplest_multiple_root_map_mutations(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "1 0 0\n" "1 0 0\n" "0 1 0\n" "0 1 0\n"; const char *edges = "0 1 4 0,1\n" "0 1 5 2,3\n"; tsk_treeseq_t ts; tsk_tree_t t; int32_t genotypes[] = { 0, 0, 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_tree_next(&t)); ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 0); free(transitions); genotypes[0] = 1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 1); CU_ASSERT_EQUAL_FATAL(transitions[0].node, 0); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1); free(transitions); genotypes[1] = 1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 1); CU_ASSERT_EQUAL_FATAL(transitions[0].node, 4); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1); free(transitions); 
tsk_tree_free(&t); tsk_treeseq_free(&ts); } static void test_simplest_chained_map_mutations(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "1 1 0\n" "1 1 0\n" "0 2 0"; const char *edges = "0 1 2 0\n" "0 1 3 1\n" "0 1 4 2,3\n"; tsk_treeseq_t ts; tsk_tree_t t; int32_t genotypes[] = { 0, 0, 0, 0 }; tsk_size_t num_transitions; tsk_state_transition_t *transitions; int32_t ancestral_state; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_tree_next(&t)); ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 0); free(transitions); genotypes[2] = 1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 2); CU_ASSERT_EQUAL_FATAL(transitions[0].node, 2); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1); CU_ASSERT_EQUAL_FATAL(transitions[1].node, 0); CU_ASSERT_EQUAL_FATAL(transitions[1].parent, 0); CU_ASSERT_EQUAL_FATAL(transitions[1].state, 0); free(transitions); tsk_tree_free(&t); tsk_treeseq_free(&ts); } static void test_simplest_mutation_edges(void) { const char *nodes = "1 0 0\n" "0 1 0\n" "0 1 0"; const char *edges = "0 1 1 0\n" "1 2 2 0\n"; const char *sites = "0.5 0\n" "1.5 0\n"; const char *mutations = "0 2 1\n" "0 1 1\n" "0 0 1\n" "1 2 1\n" "1 1 1\n" "1 0 1\n"; tsk_treeseq_t ts; tsk_tree_t tree; /* We have mutations over roots, samples and just isolated nodes */ tsk_id_t mutation_edges[] = { -1, -1, 0, -1, -1, 1 }; tsk_size_t i, j, k, t; tsk_mutation_t mut; tsk_site_t site; int ret; tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); 
CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 2);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6);

    /* Lookup by mutation ID. */
    for (j = 0; j < tsk_treeseq_get_num_mutations(&ts); j++) {
        ret = tsk_treeseq_get_mutation(&ts, (tsk_id_t) j, &mut);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        CU_ASSERT_EQUAL(mut.edge, mutation_edges[j]);
    }

    ret = tsk_tree_init(&tree, &ts, 0);
    CU_ASSERT_EQUAL(ret, 0);

    /* Walk both trees; i counts mutations in the order they are visited and
     * indexes the same expected-edge table. */
    i = 0;
    for (t = 0; t < 2; t++) {
        ret = tsk_tree_next(&tree);
        CU_ASSERT_EQUAL(ret, TSK_TREE_OK);
        for (j = 0; j < tree.sites_length; j++) {
            site = tree.sites[j];
            for (k = 0; k < site.mutations_length; k++) {
                CU_ASSERT_EQUAL(site.mutations[k].edge, mutation_edges[i]);
                i++;
            }
        }
    }
    /* Every one of the six mutations must have been visited. */
    CU_ASSERT_EQUAL(i, 6);

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
}

/*=======================================================
 * Single tree tests.
 *======================================================*/

/* Loads the shared single-tree example (nodes and edges only) and checks
 * its basic counts: 4 samples, 7 nodes, 1 tree, no mutations. */
static void
test_single_tree_good_records(void)
{
    tsk_treeseq_t ts;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 4);
    CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 7);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1);
    verify_edge_array_trees(&ts);
    tsk_treeseq_free(&ts);
}

/* Same basic-count check for a ten-node tree whose edge rows list up to
 * four children per parent: 7 samples, 10 nodes, 1 tree, no mutations. */
static void
test_single_nonbinary_tree_good_records(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n"
                        "0 2 0\n"
                        "0 3 0\n";
    const char *edges = "0 1 7 0,1,2,3\n"
                        "0 1 8 4,5\n"
                        "0 1 9 6,7,8";
    tsk_treeseq_t ts;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 7);
    CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 10);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1);
    verify_edge_array_trees(&ts);
    tsk_treeseq_free(&ts);
}

/* Corrupts one table value at a time and checks that tsk_treeseq_init
 * reports the matching error code. Each case restores the value it changed
 * before the next one runs, and the final init confirms the tables are
 * valid again. tsk_treeseq_free is called after every init, including the
 * ones that are expected to fail. */
static void
test_single_tree_bad_records(void)
{
    int ret = 0;
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;
    tsk_flags_t load_flags = TSK_TS_INIT_BUILD_INDEXES;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1;
    parse_nodes(single_tree_ex_nodes, &tables.nodes);
    CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7);
    parse_edges(single_tree_ex_edges, &tables.edges);
    CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6);

    /* Not sorted in time order */
    tables.nodes.time[5] = 0.5;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME);
    tsk_treeseq_free(&ts);
    tables.nodes.time[5] = 2.0;

    /* Left value greater than sequence right */
    tables.edges.left[2] = 2.0;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_EDGE_INTERVAL);
    tsk_treeseq_free(&ts);
    tables.edges.left[2] = 0.0;

    /* Non finite */
    tables.nodes.time[5] = INFINITY;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_TIME_NONFINITE);
    tsk_treeseq_free(&ts);
    tables.nodes.time[5] = 2.0;

    /* NaN is rejected with the same error code as infinity. */
    tables.nodes.time[5] = NAN;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_TIME_NONFINITE);
    tsk_treeseq_free(&ts);
    tables.nodes.time[5] = 2.0;

    /* Everything restored: the tables must load cleanly again. */
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, 0);
    tsk_treeseq_free(&ts);
    tsk_table_collection_free(&tables);
}

/* Loads the shared single-tree example including its sites and mutations
 * and checks every site and mutation record field by field. */
static void
test_single_tree_good_mutations(void)
{
    tsk_treeseq_t ts;
    tsk_size_t j;
    tsk_size_t num_sites = 3;
    tsk_size_t num_mutations = 7;
    /* NOTE(review): sized by non-constant locals, so these are
     * variable-length arrays. */
    tsk_site_t other_sites[num_sites];
    tsk_mutation_t other_mutations[num_mutations];
    int ret;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 4);
CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 1.0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 7);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), num_sites);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), num_mutations);

    /* Copy every site and mutation record out of the tree sequence. */
    for (j = 0; j < num_sites; j++) {
        ret = tsk_treeseq_get_site(&ts, (tsk_id_t) j, other_sites + j);
        CU_ASSERT_EQUAL(ret, 0);
    }
    for (j = 0; j < num_mutations; j++) {
        ret = tsk_treeseq_get_mutation(&ts, (tsk_id_t) j, other_mutations + j);
        CU_ASSERT_EQUAL(ret, 0);
    }

    /* State strings are compared on their first byte only (length 1). */
    CU_ASSERT_EQUAL(other_sites[0].position, 0.125);
    CU_ASSERT_NSTRING_EQUAL(other_sites[0].ancestral_state, "0", 1);
    CU_ASSERT_EQUAL(other_sites[1].position, 0.25);
    CU_ASSERT_NSTRING_EQUAL(other_sites[1].ancestral_state, "0", 1);
    CU_ASSERT_EQUAL(other_sites[2].position, 0.5);
    CU_ASSERT_NSTRING_EQUAL(other_sites[2].ancestral_state, "0", 1);

    CU_ASSERT_EQUAL(other_mutations[0].id, 0);
    CU_ASSERT_EQUAL(other_mutations[0].node, 2);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[0].derived_state, "1", 1);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[0].inherited_state, "0", 1);
    CU_ASSERT_EQUAL(other_mutations[1].id, 1);
    CU_ASSERT_EQUAL(other_mutations[1].node, 4);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[1].derived_state, "1", 1);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[1].inherited_state, "0", 1);
    /* Mutation 2 is the only one checked here whose derived state is "0"
     * and whose inherited state is "1". */
    CU_ASSERT_EQUAL(other_mutations[2].id, 2);
    CU_ASSERT_EQUAL(other_mutations[2].node, 0);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[2].derived_state, "0", 1);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[2].inherited_state, "1", 1);
    CU_ASSERT_EQUAL(other_mutations[3].id, 3);
    CU_ASSERT_EQUAL(other_mutations[3].node, 0);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[3].derived_state, "1", 1);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[3].inherited_state, "0", 1);
    CU_ASSERT_EQUAL(other_mutations[4].id, 4);
    CU_ASSERT_EQUAL(other_mutations[4].node, 1);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[4].derived_state, "1", 1);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[4].inherited_state, "0", 1);
    CU_ASSERT_EQUAL(other_mutations[5].id, 5);
    CU_ASSERT_EQUAL(other_mutations[5].node, 2);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[5].derived_state, "1", 1);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[5].inherited_state, "0", 1);
    CU_ASSERT_EQUAL(other_mutations[6].id, 6);
    CU_ASSERT_EQUAL(other_mutations[6].node, 3);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[6].derived_state, "1", 1);
    CU_ASSERT_NSTRING_EQUAL(other_mutations[6].inherited_state, "0", 1);

    tsk_treeseq_free(&ts);
}

/* Corrupts one site or mutation field at a time and checks that
 * tsk_treeseq_init reports the matching error code. Every case follows the
 * same pattern: set a bad value, init, assert the error, free, restore the
 * original value. The order of cases and restores matters, because each
 * case assumes all earlier fields are back at their valid values. */
static void
test_single_tree_bad_mutations(void)
{
    int ret = 0;
    const char *sites = "0 0\n"
                        "0.1 0\n"
                        "0.2 0\n";
    const char *mutations = "0 0 1 -1 0\n"
                            "1 1 1 -1 0\n"
                            "2 4 1 -1 1\n"
                            "2 1 0 2 0\n"
                            "2 1 1 3 0\n"
                            "2 2 1 -1 0\n";
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;
    tsk_flags_t load_flags = TSK_TS_INIT_BUILD_INDEXES;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1;
    parse_nodes(single_tree_ex_nodes, &tables.nodes);
    CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7);
    parse_edges(single_tree_ex_edges, &tables.edges);
    CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6);
    parse_sites(sites, &tables.sites);
    parse_mutations(mutations, &tables.mutations);
    CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 3);
    CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 6);
    tables.sequence_length = 1.0;

    /* Check to make sure we have legal mutations */
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6);
    tsk_treeseq_free(&ts);

    /* negative coordinate */
    tables.sites.position[0] = -1.0;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_SITE_POSITION);
    tsk_treeseq_free(&ts);
    tables.sites.position[0] = 0.0;

    /* non finite coordinates */
    tables.sites.position[0] = NAN;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret,
        TSK_ERR_BAD_SITE_POSITION);
    tsk_treeseq_free(&ts);
    tables.sites.position[0] = 0.0;

    tables.sites.position[0] = INFINITY;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_SITE_POSITION);
    tsk_treeseq_free(&ts);
    tables.sites.position[0] = 0.0;

    /* coordinate == sequence length */
    tables.sites.position[2] = 1.0;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_SITE_POSITION);
    tsk_treeseq_free(&ts);
    tables.sites.position[2] = 0.2;

    /* coordinate > sequence length */
    tables.sites.position[2] = 1.1;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_SITE_POSITION);
    tsk_treeseq_free(&ts);
    tables.sites.position[2] = 0.2;

    /* Duplicate positions */
    tables.sites.position[0] = 0.1;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_DUPLICATE_SITE_POSITION);
    tsk_treeseq_free(&ts);
    tables.sites.position[0] = 0.0;

    /* Unsorted positions */
    tables.sites.position[0] = 0.3;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_SITES);
    tsk_treeseq_free(&ts);
    tables.sites.position[0] = 0.0;

    /* site < 0 */
    tables.mutations.site[0] = -2;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS);
    tsk_treeseq_free(&ts);
    tables.mutations.site[0] = 0;

    /* site == num_sites */
    tables.mutations.site[0] = 3;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS);
    tsk_treeseq_free(&ts);
    tables.mutations.site[0] = 0;

    /* node = NULL */
    tables.mutations.node[0] = TSK_NULL;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    tsk_treeseq_free(&ts);
    tables.mutations.node[0] = 0;

    /* node >= num_nodes */
    tables.mutations.node[0] = 7;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    tsk_treeseq_free(&ts);
    tables.mutations.node[0] = 0;

    /* parent < -1 */
    tables.mutations.parent[0] = -2;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS);
    tsk_treeseq_free(&ts);
    tables.mutations.parent[0] = TSK_NULL;

    /* parent >= num_mutations */
    tables.mutations.parent[0] = 7;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS);
    tsk_treeseq_free(&ts);
    tables.mutations.parent[0] = TSK_NULL;

    /* parent on a different site */
    tables.mutations.parent[1] = 0;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE);
    tsk_treeseq_free(&ts);
    tables.mutations.parent[1] = TSK_NULL;

    /* parent is the same mutation */
    tables.mutations.parent[0] = 0;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_PARENT_EQUAL);
    tsk_treeseq_free(&ts);
    tables.mutations.parent[0] = TSK_NULL;

    /* parent_id > mutation id */
    tables.mutations.parent[3] = 4;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD);
    tsk_treeseq_free(&ts);
    tables.mutations.parent[3] = 2;

    /* time < node time */
    tables.mutations.time[2] = 0;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE);
    tsk_treeseq_free(&ts);
    tables.mutations.time[2] = 1;

    /* time > parent mutation */
    tables.mutations.time[4] = 0.5;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION);
    tsk_treeseq_free(&ts);
    tables.mutations.time[4] = 0;

    /* time > parent node */
    tables.mutations.time[0] = 1.5;
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE);
    tsk_treeseq_free(&ts);
    tables.mutations.time[0] = 0;

    /* Check to make sure we've maintained legal mutations */
    ret = tsk_treeseq_init(&ts, &tables, load_flags);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3);
CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6);
    tsk_treeseq_free(&ts);

    tsk_table_collection_free(&tables);
}

/* Positions a tree on the only tree of a seven-node sequence and checks
 * parents, per-node sample counts and MRCAs against hard-coded values,
 * then checks that advancing past the last tree returns 0. */
static void
test_single_tree_iter(void)
{
    int ret;
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n"
                        "0 2 0\n"
                        "0 3 0\n";
    const char *edges = "0 6 4 0,1\n"
                        "0 6 5 2,3\n"
                        "0 6 6 4,5\n";
    /* Expected parent of each node, indexed by node ID. */
    tsk_id_t parents[] = { 4, 4, 5, 5, 6, 6, TSK_NULL };
    tsk_treeseq_t ts;
    tsk_tree_t tree;
    tsk_id_t u, v, w;
    tsk_size_t num_samples;
    tsk_size_t num_nodes = 7;

    tsk_treeseq_from_text(&ts, 6, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    verify_edge_array_trees(&ts);

    ret = tsk_tree_init(&tree, &ts, 0);
    CU_ASSERT_EQUAL(ret, 0);
    ret = tsk_tree_first(&tree);
    CU_ASSERT_EQUAL(ret, TSK_TREE_OK);

    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), num_nodes);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1);
    CU_ASSERT_EQUAL(tree.num_children[4], 2);
    CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&tree), 1);
    /* Output goes to _devnull; the call is made for coverage only. */
    tsk_tree_print_state(&tree, _devnull);

    for (u = 0; u < (tsk_id_t) num_nodes; u++) {
        ret = tsk_tree_get_parent(&tree, u, &v);
        CU_ASSERT_EQUAL(ret, 0);
        CU_ASSERT_EQUAL(v, parents[u]);
    }
    ret = tsk_tree_get_num_samples(&tree, 0, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 1);
    ret = tsk_tree_get_num_samples(&tree, 4, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 2);
    ret = tsk_tree_get_num_samples(&tree, 6, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 4);
    ret = tsk_tree_get_mrca(&tree, 0, 1, &w);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(w, 4);
    ret = tsk_tree_get_mrca(&tree, 0, 2, &w);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(w, 6);

    /* There is only one tree, so the next step must return 0. */
    ret = tsk_tree_next(&tree);
    CU_ASSERT_EQUAL(ret, 0);

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
}

/* As test_single_tree_iter, for a ten-node tree with up to four children
 * per parent. Additionally walks the right_child / left_sib links of each
 * internal node to pin the order in which children are stored. */
static void
test_single_nonbinary_tree_iter(void)
{
    int ret;
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n"
                        "0 2 0\n"
                        "0 3 0\n";
    const char *edges = "0 1 7 0,1,2,3\n"
                        "0 1 8 4,5\n"
                        "0 1 9 6,7,8\n";
    /* Expected parent of each node, indexed by node ID. */
    tsk_id_t parents[] = { 7,
        7, 7, 7, 8, 8, 9, 9, 9, TSK_NULL };
    tsk_treeseq_t ts;
    tsk_tree_t tree;
    tsk_id_t u, v, w;
    tsk_size_t num_samples;
    tsk_size_t num_nodes = 10;
    tsk_size_t total_samples = 7;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    verify_edge_array_trees(&ts);

    ret = tsk_tree_init(&tree, &ts, 0);
    CU_ASSERT_EQUAL(ret, 0);
    ret = tsk_tree_first(&tree);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);

    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), num_nodes);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1);
    tsk_tree_print_state(&tree, _devnull);

    for (u = 0; u < (tsk_id_t) num_nodes; u++) {
        ret = tsk_tree_get_parent(&tree, u, &v);
        CU_ASSERT_EQUAL(ret, 0);
        CU_ASSERT_EQUAL(v, parents[u]);
    }
    /* Nodes 0..6 each count one sample and have no children. */
    for (u = 0; u < (tsk_id_t) total_samples; u++) {
        ret = tsk_tree_get_num_samples(&tree, u, &num_samples);
        CU_ASSERT_EQUAL(ret, 0);
        CU_ASSERT_EQUAL(num_samples, 1);
        CU_ASSERT_EQUAL(tree.left_child[u], TSK_NULL);
    }

    /* Node 7: children 0,1,2,3, read right-to-left via left_sib. */
    u = 7;
    ret = tsk_tree_get_num_samples(&tree, u, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 4);
    CU_ASSERT_EQUAL(tree.right_child[u], 3);
    CU_ASSERT_EQUAL(tree.left_sib[3], 2);
    CU_ASSERT_EQUAL(tree.left_sib[2], 1);
    CU_ASSERT_EQUAL(tree.left_sib[1], 0);
    CU_ASSERT_EQUAL(tree.left_sib[0], TSK_NULL);
    CU_ASSERT_EQUAL(tree.num_children[u], 4);

    /* Node 8: children 4,5. */
    u = 8;
    ret = tsk_tree_get_num_samples(&tree, u, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 2);
    CU_ASSERT_EQUAL(tree.right_child[u], 5);
    CU_ASSERT_EQUAL(tree.left_sib[5], 4);
    CU_ASSERT_EQUAL(tree.left_sib[4], TSK_NULL);
    CU_ASSERT_EQUAL(tree.num_children[u], 2);

    /* Node 9: children 6,7,8; it subtends all seven samples. */
    u = 9;
    ret = tsk_tree_get_num_samples(&tree, u, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 7);
    CU_ASSERT_EQUAL(tree.right_child[u], 8);
    CU_ASSERT_EQUAL(tree.left_sib[8], 7);
    CU_ASSERT_EQUAL(tree.left_sib[7], 6);
    CU_ASSERT_EQUAL(tree.left_sib[6], TSK_NULL);
    CU_ASSERT_EQUAL(tree.num_children[u], 3);

    CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&tree), 1);
    CU_ASSERT_EQUAL(tsk_tree_get_left_root(&tree), 9);

    ret =
        tsk_tree_get_mrca(&tree, 0, 1, &w);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(w, 7);
    ret = tsk_tree_get_mrca(&tree, 0, 4, &w);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(w, 9);

    /* There is only one tree, so the next step must return 0. */
    ret = tsk_tree_next(&tree);
    CU_ASSERT_EQUAL(ret, 0);

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
}

/* As test_single_tree_iter, but the node rows are listed in the opposite
 * order, so the reported samples are nodes 3..6 and node 0 is the one with
 * no parent. */
static void
test_single_tree_general_samples_iter(void)
{
    int ret;
    const char *nodes = "0 3 0\n"
                        "0 2 0\n"
                        "0 1 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n";
    const char *edges = "0 6 2 3,4\n"
                        "0 6 1 5,6\n"
                        "0 6 0 1,2\n";
    /* Expected parent of each node, indexed by node ID. */
    tsk_id_t parents[] = { TSK_NULL, 0, 0, 2, 2, 1, 1 };
    const tsk_id_t *samples;
    tsk_treeseq_t ts;
    tsk_tree_t tree;
    tsk_id_t u, v, w;
    tsk_size_t num_samples;
    tsk_size_t num_nodes = 7;

    tsk_treeseq_from_text(&ts, 6, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    samples = tsk_treeseq_get_samples(&ts);
    CU_ASSERT_EQUAL(samples[0], 3);
    CU_ASSERT_EQUAL(samples[1], 4);
    CU_ASSERT_EQUAL(samples[2], 5);
    CU_ASSERT_EQUAL(samples[3], 6);
    verify_edge_array_trees(&ts);

    ret = tsk_tree_init(&tree, &ts, 0);
    CU_ASSERT_EQUAL(ret, 0);
    ret = tsk_tree_first(&tree);
    CU_ASSERT_EQUAL(ret, TSK_TREE_OK);

    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), num_nodes);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1);
    tsk_tree_print_state(&tree, _devnull);

    for (u = 0; u < (tsk_id_t) num_nodes; u++) {
        ret = tsk_tree_get_parent(&tree, u, &v);
        CU_ASSERT_EQUAL(ret, 0);
        CU_ASSERT_EQUAL(v, parents[u]);
    }
    ret = tsk_tree_get_num_samples(&tree, 3, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 1);
    ret = tsk_tree_get_num_samples(&tree, 2, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 2);
    ret = tsk_tree_get_num_samples(&tree, 0, &num_samples);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(num_samples, 4);
    ret = tsk_tree_get_mrca(&tree, 3, 4, &w);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(w, 2);
    ret = tsk_tree_get_mrca(&tree, 3, 6, &w);
    CU_ASSERT_EQUAL(ret, 0);
    CU_ASSERT_EQUAL(w, 0);

    /* There is only one tree, so the next step must return 0. */
    ret = tsk_tree_next(&tree);
    CU_ASSERT_EQUAL(ret, 0);

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
}
/* Checks tsk_tree_get_parent and tsk_tree_get_time for every node of a
 * single tree against the hard-coded parents[] and times[] tables. */
static void
test_single_tree_iter_times(void)
{
    int ret = 0;
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 2 0\n"
                        "1 3 0\n"
                        "0 1 0\n"
                        "0 4 0\n"
                        "0 5 0\n";
    const char *edges = "0 6 4 0,1\n"
                        "0 6 5 2,3\n"
                        "0 6 6 4,5\n";
    tsk_id_t parents[] = { 4, 4, 5, 5, 6, 6, TSK_NULL };
    /* Expected time of each node, indexed by node ID. */
    double times[] = { 0.0, 0.0, 2.0, 3.0, 1.0, 4.0, 5.0 };
    double t;
    tsk_treeseq_t ts;
    tsk_tree_t tree;
    tsk_id_t u, v;
    uint32_t num_nodes = 7;

    tsk_treeseq_from_text(&ts, 6, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

    ret = tsk_tree_init(&tree, &ts, 0);
    CU_ASSERT_EQUAL(ret, 0);
    ret = tsk_tree_first(&tree);
    CU_ASSERT_EQUAL(ret, TSK_TREE_OK);

    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), num_nodes);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1);
    tsk_tree_print_state(&tree, _devnull);

    for (u = 0; u < (tsk_id_t) num_nodes; u++) {
        ret = tsk_tree_get_parent(&tree, u, &v);
        CU_ASSERT_EQUAL(ret, 0);
        CU_ASSERT_EQUAL(v, parents[u]);
        ret = tsk_tree_get_time(&tree, u, &t);
        CU_ASSERT_EQUAL(ret, 0);
        CU_ASSERT_EQUAL(t, times[u]);
    }
    /* There is only one tree, so the next step must return 0. */
    ret = tsk_tree_next(&tree);
    CU_ASSERT_EQUAL(ret, 0);

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
}

/* Checks tsk_tree_get_depth for every node against depths[], and that both
 * num_nodes + 1 and TSK_NULL are rejected as out of bounds. */
static void
test_single_tree_iter_depths(void)
{
    int ret = 0;
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n"
                        "0 2 0\n"
                        "0 3 0\n";
    const char *edges = "0 6 4 0,1\n"
                        "0 6 5 2,3\n"
                        "0 6 6 4,5\n";
    /* Expected depth of each node, indexed by node ID. */
    int depths[] = { 2, 2, 2, 2, 1, 1, 0 };
    int depth;
    tsk_treeseq_t ts;
    tsk_tree_t tree;
    tsk_id_t u;
    uint32_t num_nodes = 7;

    tsk_treeseq_from_text(&ts, 6, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

    ret = tsk_tree_init(&tree, &ts, 0);
    CU_ASSERT_EQUAL(ret, 0);
    ret = tsk_tree_first(&tree);
    CU_ASSERT_EQUAL(ret, TSK_TREE_OK);

    CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), num_nodes);

    for (u = 0; u < (tsk_id_t) num_nodes; u++) {
        ret = tsk_tree_get_depth(&tree, u, &depth);
        CU_ASSERT_EQUAL(ret, 0);
        CU_ASSERT_EQUAL(depth, depths[u]);
    }

    /* NOTE(review): the first out-of-range ID tried is num_nodes + 1, not
     * num_nodes; presumably ID num_nodes itself is accepted - confirm. */
    ret = tsk_tree_get_depth(&tree, (tsk_id_t) num_nodes + 1, &depth);
    CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    ret =
        tsk_tree_get_depth(&tree, TSK_NULL, &depth);
    CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

    ret = tsk_tree_next(&tree);
    CU_ASSERT_EQUAL(ret, 0);

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
}

/* Simplifies a copy of the single-tree example's tables, first on samples
 * { 0, 1 } and then on zero samples, checking the resulting row counts.
 * It then re-copies the tables (TSK_NO_INIT reuses the same `tables`
 * object) and breaks them in four ways, checking the simplify error code
 * for each. */
static void
test_single_tree_simplify(void)
{
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;
    tsk_id_t samples[] = { 0, 1 };
    int ret;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
    verify_simplify(&ts);
    ret = tsk_treeseq_copy_tables(&ts, &tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(tables.nodes.num_rows, 3);
    CU_ASSERT_EQUAL(tables.edges.num_rows, 2);

    /* Zero samples gives us the empty table collection */
    ret = tsk_table_collection_simplify(&tables, samples, 0, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(tables.nodes.num_rows, 0);
    CU_ASSERT_EQUAL(tables.edges.num_rows, 0);

    /* Make sure we detect unsorted edges */
    ret = tsk_treeseq_copy_tables(&ts, &tables, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    unsort_edges(&tables.edges, 0);
    ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGES_NOT_SORTED_CHILD);

    /* detect bad parents */
    ret = tsk_treeseq_copy_tables(&ts, &tables, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.edges.parent[0] = -1;
    ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NULL_PARENT);

    /* detect bad children */
    ret = tsk_treeseq_copy_tables(&ts, &tables, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.edges.child[0] = -1;
    ret = tsk_table_collection_simplify(&tables, samples, 2, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NULL_CHILD);

    /* detect loops */
    ret = tsk_treeseq_copy_tables(&ts, &tables, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    /* An edge whose child equals its parent is reported as a time-ordering
     * error. */
    tables.edges.child[0] = tables.edges.parent[0];
    ret =
        tsk_table_collection_simplify(&tables, samples, 2, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NODE_TIME_ORDERING);

    tsk_treeseq_free(&ts);
    tsk_table_collection_free(&tables);
}

/* Runs tsk_treeseq_simplify with TSK_DEBUG while the debug stream is
 * pointed at a temporary file, and checks that something was written to it
 * (file position > 0). The debug stream is set back to stdout before any
 * fatal assertion can abort the test. */
static void
test_single_tree_simplify_debug(void)
{
    tsk_treeseq_t ts, simplified;
    tsk_id_t samples[] = { 0, 1 };
    int ret;
    FILE *tmp = fopen(_tmp_file_name, "w");

    CU_ASSERT_FATAL(tmp != NULL);

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);

    tsk_set_debug_stream(tmp);
    ret = tsk_treeseq_simplify(&ts, samples, 2, TSK_DEBUG, &simplified, NULL);
    tsk_set_debug_stream(stdout);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(ftell(tmp) > 0);

    fclose(tmp);
    tsk_treeseq_free(&ts);
    tsk_treeseq_free(&simplified);
}

/* Simplifies on samples { 0, 1 } with TSK_SIMPLIFY_KEEP_INPUT_ROOTS and
 * checks the resulting row counts: 4 nodes, 3 edges, 3 sites and
 * 4 mutations (the plain simplify in test_single_tree_simplify leaves
 * 3 nodes and 2 edges). */
static void
test_single_tree_simplify_keep_input_roots(void)
{
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;
    tsk_id_t samples[] = { 0, 1 };
    int ret;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
        single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
    verify_simplify(&ts);
    ret = tsk_treeseq_copy_tables(&ts, &tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_table_collection_simplify(
        &tables, samples, 2, TSK_SIMPLIFY_KEEP_INPUT_ROOTS, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL(tables.nodes.num_rows, 4);
    CU_ASSERT_EQUAL(tables.edges.num_rows, 3);
    CU_ASSERT_EQUAL(tables.sites.num_rows, 3);
    CU_ASSERT_EQUAL(tables.mutations.num_rows, 4);

    tsk_treeseq_free(&ts);
    tsk_table_collection_free(&tables);
}

/* Clears every node flag in one copy of the tables, simplifies it on
 * samples { 0, 1, 2, 3 }, and checks the result equals an untouched
 * second copy. */
static void
test_single_tree_simplify_no_sample_nodes(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_table_collection_t t1, t2;
    tsk_id_t samples[] = { 0, 1, 2, 3 };

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_treeseq_copy_tables(&ts, &t1, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_treeseq_copy_tables(&ts, &t2, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* We zero out the sample column in t1, and run simplify. We should
     * get back the same table */
    tsk_memset(t1.nodes.flags, 0, sizeof(*t1.nodes.flags) * t1.nodes.num_rows);
    ret = tsk_table_collection_simplify(&t1, samples, 4, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0));

    tsk_table_collection_free(&t1);
    tsk_table_collection_free(&t2);
    tsk_treeseq_free(&ts);
}

/* Simplifies with a NULL sample list (and a count of 0) and checks the
 * tables are unchanged relative to an untouched copy. */
static void
test_single_tree_simplify_null_samples(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_table_collection_t t1, t2;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_treeseq_copy_tables(&ts, &t1, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_treeseq_copy_tables(&ts, &t2, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    ret = tsk_table_collection_simplify(&t1, NULL, 0, 0, NULL);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0));

    tsk_table_collection_free(&t1);
    tsk_table_collection_free(&t2);
    tsk_treeseq_free(&ts);
}

/* Builds the single-tree example with three sites and six mutations, then
 * checks tsk_table_collection_compute_mutation_parents on the valid tables
 * and on a series of deliberately broken ones. */
static void
test_single_tree_compute_mutation_parents(void)
{
    int ret = 0;
    const char *sites = "0 0\n"
                        "0.1 0\n"
                        "0.2 0\n";
    const char *mutations = "0 0 1 -1\n"
                            "1 1 1 -1\n"
                            "2 4 1 -1\n"
                            "2 1 0 2 \n"
                            "2 1 1 3 \n"
                            "2 2 1 -1\n";
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    tables.sequence_length = 1;
    parse_nodes(single_tree_ex_nodes, &tables.nodes);
    CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7);
    parse_edges(single_tree_ex_edges, &tables.edges);
    CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6);
    parse_sites(sites, &tables.sites);
    parse_mutations(mutations, &tables.mutations);
    CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 3);
    CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 6);
    tables.sequence_length = 1.0;

    /* The tree sequence is initialised with flags 0 below, so the indexes
     * are built explicitly here. */
    ret = tsk_table_collection_build_index(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Check to make sure we have legal mutations */
    ret = tsk_treeseq_init(&ts, &tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6); /* Compute the mutation parents */ verify_compute_mutation_parents(&ts); tsk_treeseq_free(&ts); /* Bad site reference */ tables.mutations.site[0] = -1; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); tables.mutations.site[0] = 0; /* Bad site reference */ tables.mutations.site[0] = -1; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); tables.mutations.site[0] = 0; /* mutation sites out of order */ tables.mutations.site[0] = 2; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_MUTATIONS); tables.mutations.site[0] = 0; /* sites out of order */ tables.sites.position[0] = 0.11; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_SITES); tables.sites.position[0] = 0; /* Bad node reference */ tables.mutations.node[0] = -1; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tables.mutations.node[0] = 0; /* Bad node reference */ tables.mutations.node[0] = (tsk_id_t) tables.nodes.num_rows; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tables.mutations.node[0] = 0; /* Mutations not ordered by tree */ tables.mutations.node[2] = 1; tables.mutations.node[3] = 4; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD); tables.mutations.node[2] = 4; tables.mutations.node[3] = 1; /* Need to reset the parent field here */ tsk_memset( tables.mutations.parent, 0xff, tables.mutations.num_rows * sizeof(tsk_id_t)); /* Mutations not ordered by site */ tables.mutations.site[3] = 1; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); 
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_MUTATIONS); tables.mutations.site[3] = 2; /* Check to make sure we still have legal mutations */ ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &tables, 0); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_single_tree_compute_mutation_times(void) { int ret = 0; const char *sites = "0 0\n" "0.1 0\n" "0.2 0\n" "0.3 0\n"; const char *mutations = "0 0 1 -1 3\n" "1 1 1 -1 3\n" "2 4 1 -1 8\n" "2 1 0 2 4\n" "2 2 1 -1 4\n" "2 1 1 3 2\n" "3 6 1 -1 10\n"; /* 6 */ /* 6 */ /* / \ */ /* / \ */ /* 2 \ */ /* / 5 */ /* 4 / \ */ /* 0 1,3,4 5 \ */ /* 0 1 2 3 */ tsk_treeseq_t ts; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; parse_nodes(single_tree_ex_nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7); tables.nodes.time[4] = 6; tables.nodes.time[5] = 8; tables.nodes.time[6] = 10; parse_edges(single_tree_ex_edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6); parse_sites(sites, &tables.sites); parse_mutations(mutations, &tables.mutations); CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 4); CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 7); tables.sequence_length = 1.0; ret = tsk_table_collection_build_index(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Check to make sure we have legal mutations */ ret = tsk_treeseq_init(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 7); /* Compute the mutation times */ verify_compute_mutation_times(&ts); /* Verify consistency of individuals */ verify_individual_nodes(&ts); tsk_treeseq_free(&ts); /* Bad random param */ ret = 
tsk_table_collection_compute_mutation_times(&tables, (double *) 1, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Bad site reference */ tables.mutations.site[0] = -1; ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); tables.mutations.site[0] = 0; /* Bad site reference */ tables.mutations.site[0] = -1; ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); tables.mutations.site[0] = 0; /* mutation sites out of order */ tables.mutations.site[0] = 2; ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_MUTATIONS); tables.mutations.site[0] = 0; /* sites out of order */ tables.sites.position[0] = 0.11; ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_SITES); tables.sites.position[0] = 0; /* Bad node reference */ tables.mutations.node[0] = -1; ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tables.mutations.node[0] = 0; /* Bad node reference */ tables.mutations.node[0] = (tsk_id_t) tables.nodes.num_rows; ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tables.mutations.node[0] = 0; /* Mutations not ordered by site */ tables.mutations.site[2] = 0; ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_MUTATIONS); tables.mutations.site[2] = 2; ret = tsk_treeseq_init(&ts, &tables, 0); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 4); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 7); tsk_treeseq_free(&ts); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_single_tree_mutation_edges(void) { int ret = 0; tsk_size_t i, j, k; tsk_treeseq_t ts; tsk_tree_t tree; tsk_mutation_t 
mut; tsk_site_t site; tsk_id_t mutation_edges[] = { 2, 4, 0, 0, 1, 2, 3 }; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); for (j = 0; j < 7; j++) { ret = tsk_treeseq_get_mutation(&ts, (tsk_id_t) j, &mut); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(mut.edge, mutation_edges[j]); } ret = tsk_tree_init(&tree, &ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL(ret, TSK_TREE_OK); i = 0; for (j = 0; j < tree.sites_length; j++) { site = tree.sites[j]; for (k = 0; k < site.mutations_length; k++) { CU_ASSERT_EQUAL(site.mutations[k].edge, mutation_edges[i]); i++; } } CU_ASSERT_EQUAL(i, 7); tsk_tree_free(&tree); tsk_treeseq_free(&ts); } static void test_single_tree_is_descendant(void) { int ret; tsk_treeseq_t ts; tsk_tree_t tree; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&tree, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 0, 4)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 1, 4)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 0, 6)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 1, 6)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 4, 6)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 2, 5)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 3, 5)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 2, 6)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 3, 6)); CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 5, 6)); /* Nodes are descendents of themselves. 
*/
    CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 0, 0));
    CU_ASSERT_TRUE(tsk_tree_is_descendant(&tree, 1, 1));

    /* Pairs for which the relation is expected to be false. */
    CU_ASSERT_FALSE(tsk_tree_is_descendant(&tree, 0, 1));
    CU_ASSERT_FALSE(tsk_tree_is_descendant(&tree, 0, 2));
    CU_ASSERT_FALSE(tsk_tree_is_descendant(&tree, 0, 5));

    /* Out of bounds nodes always return false. */
    CU_ASSERT_FALSE(tsk_tree_is_descendant(&tree, -1, 5));
    CU_ASSERT_FALSE(tsk_tree_is_descendant(&tree, 100, 5));
    CU_ASSERT_FALSE(tsk_tree_is_descendant(&tree, -1, -1));

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
}

/* Checks tsk_tree_get_total_branch_length on the first tree of the
 * single-tree example: the expected length for several starting nodes, and
 * the error code for node IDs outside the valid range. */
static void
test_single_tree_total_branch_length(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t tree;
    double length;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_tree_init(&tree, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&tree);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);

    /* TSK_NULL, the literal 7 and tree.virtual_root are all expected to give
     * the same total of 9.
     * NOTE(review): presumably 7 is the virtual root's ID for this 7-node
     * example -- confirm. */
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_total_branch_length(&tree, TSK_NULL, &length), 0);
    CU_ASSERT_EQUAL_FATAL(length, 9);
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_total_branch_length(&tree, 7, &length), 0);
    CU_ASSERT_EQUAL_FATAL(length, 9);
    CU_ASSERT_EQUAL_FATAL(
        tsk_tree_get_total_branch_length(&tree, tree.virtual_root, &length), 0);
    CU_ASSERT_EQUAL_FATAL(length, 9);

    /* Expected totals when starting from individual nodes 4, 0 and 5. */
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_total_branch_length(&tree, 4, &length), 0);
    CU_ASSERT_EQUAL_FATAL(length, 2);
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_total_branch_length(&tree, 0, &length), 0);
    CU_ASSERT_EQUAL_FATAL(length, 0);
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_total_branch_length(&tree, 5, &length), 0);
    CU_ASSERT_EQUAL_FATAL(length, 4);

    /* Node IDs on either side of the accepted range are rejected; -2 is used
     * for the low side because -1 (TSK_NULL) is accepted above. */
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_total_branch_length(&tree, -2, &length),
        TSK_ERR_NODE_OUT_OF_BOUNDS);
    CU_ASSERT_EQUAL_FATAL(
        tsk_tree_get_total_branch_length(&tree, 8, &length), TSK_ERR_NODE_OUT_OF_BOUNDS);

    tsk_tree_free(&tree);
    tsk_treeseq_free(&ts);
}

/* Checks tsk_tree_num_lineages on the first tree of the single-tree example
 * for a range of query times. */
static void
test_single_tree_num_lineages(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t tree;
    tsk_size_t num_lineages;

    tsk_treeseq_from_text(&ts, 1,
single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&tree, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL_FATAL(tsk_tree_num_lineages(&tree, 0, &num_lineages), 0); CU_ASSERT_EQUAL_FATAL(num_lineages, 4); CU_ASSERT_EQUAL_FATAL(tsk_tree_num_lineages(&tree, -1, &num_lineages), 0); CU_ASSERT_EQUAL_FATAL(num_lineages, 0); CU_ASSERT_EQUAL_FATAL(tsk_tree_num_lineages(&tree, 1, &num_lineages), 0); CU_ASSERT_EQUAL_FATAL(num_lineages, 3); CU_ASSERT_EQUAL_FATAL(tsk_tree_num_lineages(&tree, 2, &num_lineages), 0); CU_ASSERT_EQUAL_FATAL(num_lineages, 2); CU_ASSERT_EQUAL_FATAL(tsk_tree_num_lineages(&tree, 2.999, &num_lineages), 0); CU_ASSERT_EQUAL_FATAL(num_lineages, 2); CU_ASSERT_EQUAL_FATAL(tsk_tree_num_lineages(&tree, 3, &num_lineages), 0); CU_ASSERT_EQUAL_FATAL(num_lineages, 0); CU_ASSERT_EQUAL_FATAL(tsk_tree_num_lineages(&tree, 300, &num_lineages), 0); CU_ASSERT_EQUAL_FATAL(num_lineages, 0); CU_ASSERT_EQUAL_FATAL( tsk_tree_num_lineages(&tree, INFINITY, &num_lineages), TSK_ERR_TIME_NONFINITE); CU_ASSERT_EQUAL_FATAL( tsk_tree_num_lineages(&tree, NAN, &num_lineages), TSK_ERR_TIME_NONFINITE); tsk_tree_free(&tree); tsk_treeseq_free(&ts); } static void test_single_tree_map_mutations(void) { tsk_treeseq_t ts; tsk_tree_t t; int32_t genotypes[] = { 0, 1, 1, 1 }; int ret = 0; tsk_size_t num_transitions; tsk_state_transition_t *transitions; int32_t ancestral_state, j; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 4); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_tree_next(&t)); ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 1); CU_ASSERT_EQUAL_FATAL(num_transitions, 1); 
CU_ASSERT_EQUAL_FATAL(transitions[0].node, 0); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, 0); free(transitions); genotypes[0] = 1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 1); CU_ASSERT_EQUAL_FATAL(num_transitions, 0); free(transitions); genotypes[0] = 0; genotypes[1] = 0; genotypes[2] = 0; genotypes[3] = 0; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 0); free(transitions); for (j = 1; j < 64; j++) { genotypes[0] = j; genotypes[1] = 0; genotypes[2] = 0; genotypes[3] = 0; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 1); CU_ASSERT_EQUAL_FATAL(transitions[0].node, 0); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, j); free(transitions); } genotypes[0] = 2; genotypes[1] = 1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 2); CU_ASSERT_EQUAL_FATAL(transitions[0].node, 4); CU_ASSERT_EQUAL_FATAL(transitions[0].parent, TSK_NULL); CU_ASSERT_EQUAL_FATAL(transitions[0].state, 1); CU_ASSERT_EQUAL_FATAL(transitions[1].node, 0); CU_ASSERT_EQUAL_FATAL(transitions[1].parent, 0); CU_ASSERT_EQUAL_FATAL(transitions[1].state, 2); free(transitions); genotypes[0] = 1; genotypes[1] = 2; genotypes[2] = 3; genotypes[3] = 4; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); 
CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 3); free(transitions); ancestral_state = 5; ret = tsk_tree_map_mutations(&t, genotypes, NULL, TSK_MM_FIXED_ANCESTRAL_STATE, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 4); CU_ASSERT_EQUAL_FATAL(ancestral_state, 5); free(transitions); ancestral_state = -1; ret = tsk_tree_map_mutations(&t, genotypes, NULL, TSK_MM_FIXED_ANCESTRAL_STATE, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_ANCESTRAL_STATE); ancestral_state = 64; ret = tsk_tree_map_mutations(&t, genotypes, NULL, TSK_MM_FIXED_ANCESTRAL_STATE, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_ANCESTRAL_STATE); genotypes[0] = 64; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_GENOTYPE); genotypes[0] = -2; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_GENOTYPE); genotypes[0] = -1; genotypes[1] = -1; genotypes[2] = -1; genotypes[3] = -1; ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_GENOTYPES_ALL_MISSING); tsk_tree_free(&t); tsk_treeseq_free(&ts); } static void test_single_tree_map_mutations_internal_samples(void) { /* Example derived from test case provoking a segfault */ const char *nodes = "0 0.00000000000000 0\n" "0 0.00000000000000 0\n" "1 0.00000000000000 0\n" "1 0.00000000000000 0\n" "1 0.00000000000000 0\n" "0 0.10792116530237 0\n" "1 1.00674711128465 0\n" "1 1.24675560985525 0\n" "0 1.78536352520779 0\n"; const char *edges = "0.00000000 1.00000000 5 0\n" "0.00000000 1.00000000 5 2\n" "0.00000000 1.00000000 6 4\n" "0.00000000 1.00000000 6 5\n" "0.00000000 1.00000000 7 1\n" "0.00000000 1.00000000 7 
3\n" "0.00000000 1.00000000 8 6\n" "0.00000000 1.00000000 8 7\n"; tsk_treeseq_t ts; tsk_tree_t t; int32_t genotypes[] = { 0, 2, 2, 1, 0 }; int ret = 0; tsk_size_t num_transitions; tsk_state_transition_t *transitions; int32_t ancestral_state; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 5); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_tree_next(&t)); ret = tsk_tree_map_mutations( &t, genotypes, NULL, 0, &ancestral_state, &num_transitions, &transitions); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ancestral_state, 0); CU_ASSERT_EQUAL_FATAL(num_transitions, 4); free(transitions); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_single_tree_tracked_samples(void) { tsk_treeseq_t ts; tsk_tree_t tree; tsk_id_t samples[] = { 0, 1 }; tsk_size_t n; int ret; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); ret = tsk_tree_init(&tree, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_set_tracked_samples(&tree, 2, samples); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_num_tracked_samples(&tree, 0, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 1); ret = tsk_tree_get_num_tracked_samples(&tree, 4, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 0); ret = tsk_tree_get_num_tracked_samples(&tree, tree.virtual_root, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 2); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); ret = tsk_tree_get_num_tracked_samples(&tree, 0, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 1); ret = tsk_tree_get_num_tracked_samples(&tree, 4, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 2); ret = tsk_tree_get_num_tracked_samples(&tree, 5, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 0); ret = 
tsk_tree_get_num_tracked_samples(&tree, 6, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 2); ret = tsk_tree_get_num_tracked_samples(&tree, tree.virtual_root, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 2); ret = tsk_tree_next(&tree); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_num_tracked_samples(&tree, 0, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 1); ret = tsk_tree_get_num_tracked_samples(&tree, 4, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 0); ret = tsk_tree_get_num_tracked_samples(&tree, tree.virtual_root, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 2); ret = tsk_tree_next(&tree); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); ret = tsk_tree_get_num_tracked_samples(&tree, 0, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 1); ret = tsk_tree_get_num_tracked_samples(&tree, 4, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 2); ret = tsk_tree_get_num_tracked_samples(&tree, tree.virtual_root, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 2); ret = tsk_tree_set_tracked_samples(&tree, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_get_num_tracked_samples(&tree, 0, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 0); ret = tsk_tree_get_num_tracked_samples(&tree, tree.virtual_root, &n); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(n, 0); tsk_treeseq_free(&ts); tsk_tree_free(&tree); } static void test_single_tree_tree_pos(void) { tsk_treeseq_t ts; tsk_tree_position_t tree_pos; bool valid; int ret; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_position_init(&tree_pos, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); valid = tsk_tree_position_next(&tree_pos); CU_ASSERT_FATAL(valid); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 1); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 0); 
CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, 6); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_FORWARD); valid = tsk_tree_position_next(&tree_pos); CU_ASSERT_FATAL(!valid); tsk_tree_position_print_state(&tree_pos, _devnull); CU_ASSERT_EQUAL_FATAL(tree_pos.index, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 6); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_FORWARD); valid = tsk_tree_position_prev(&tree_pos); CU_ASSERT_FATAL(valid); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 1); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_REVERSE); valid = tsk_tree_position_prev(&tree_pos); CU_ASSERT_FATAL(!valid); CU_ASSERT_EQUAL_FATAL(tree_pos.index, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_REVERSE); ret = tsk_tree_position_seek_forward(&tree_pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 1); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, 6); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, 
ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_removal_order) CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_FORWARD); valid = tsk_tree_position_next(&tree_pos); CU_ASSERT_FATAL(!valid); CU_ASSERT_EQUAL_FATAL(tree_pos.index, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 6); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_FORWARD); ret = tsk_tree_position_seek_backward(&tree_pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 1); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_REVERSE); tsk_tree_position_free(&tree_pos); tsk_treeseq_free(&ts); } /*======================================================= * Multi tree tests. 
*======================================================*/ static void test_simple_multi_tree(void) { // clang-format off tsk_id_t parents[] = { 6, 5, 8, 5, TSK_NULL, 6, 8, TSK_NULL, TSK_NULL, 6, 5, 4, 4, 5, 6, TSK_NULL, TSK_NULL, TSK_NULL, 7, 5, 4, 4, 5, 7, TSK_NULL, TSK_NULL, TSK_NULL, }; // clang-format on uint32_t num_trees = 3; tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); verify_trees(&ts, num_trees, parents); verify_edge_array_trees(&ts); tsk_treeseq_free(&ts); } static void test_multi_tree_direction_switching_tree_pos(void) { tsk_treeseq_t ts; tsk_tree_position_t tree_pos; bool valid; int ret = 0; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_tree_position_init(&tree_pos, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); valid = tsk_tree_position_next(&tree_pos); CU_ASSERT_FATAL(valid); CU_ASSERT_EQUAL_FATAL(tree_pos.index, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 2); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, 6); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_FORWARD); valid = tsk_tree_position_prev(&tree_pos); CU_ASSERT_FATAL(!valid); CU_ASSERT_EQUAL_FATAL(tree_pos.index, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_REVERSE); valid = tsk_tree_position_prev(&tree_pos); CU_ASSERT_FATAL(valid); 
CU_ASSERT_EQUAL_FATAL(tree_pos.index, 2); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 7); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 10); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 10); CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, 4); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 10); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 10); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_REVERSE); valid = tsk_tree_position_next(&tree_pos); CU_ASSERT_FATAL(!valid); CU_ASSERT_EQUAL_FATAL(tree_pos.index, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 11); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_FORWARD); ret = tsk_tree_position_seek_forward(&tree_pos, 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 7); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 10); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, 11); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_FORWARD); ret = tsk_tree_position_seek_backward(&tree_pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.index, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 2); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 4); CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, -1); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 10); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 5); 
CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_REVERSE); ret = tsk_tree_position_seek_forward(&tree_pos, 2); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.index, 2); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.left, 7); CU_ASSERT_EQUAL_FATAL(tree_pos.interval.right, 10); CU_ASSERT_EQUAL_FATAL(tree_pos.in.start, 6); CU_ASSERT_EQUAL_FATAL(tree_pos.in.stop, 11); CU_ASSERT_EQUAL_FATAL(tree_pos.in.order, ts.tables->indexes.edge_insertion_order); CU_ASSERT_EQUAL_FATAL(tree_pos.out.start, 0); CU_ASSERT_EQUAL_FATAL(tree_pos.out.stop, 5); CU_ASSERT_EQUAL_FATAL(tree_pos.out.order, ts.tables->indexes.edge_removal_order); CU_ASSERT_EQUAL_FATAL(tree_pos.direction, TSK_DIR_FORWARD); tsk_tree_position_free(&tree_pos); tsk_treeseq_free(&ts); } static void test_unary_multi_tree(void) { // clang-format off tsk_id_t parents[] = { 6, 5, 7, 5, TSK_NULL, 6, 8, 8, TSK_NULL, 5, 6, 5, 4, 4, 5, 6, 8, TSK_NULL, TSK_NULL, 5, 7, 5, 4, 4, 5, 7, TSK_NULL, TSK_NULL, TSK_NULL, 5, }; // clang-format on tsk_treeseq_t ts; uint32_t num_trees = 3; tsk_treeseq_from_text(&ts, 10, unary_ex_nodes, unary_ex_edges, NULL, unary_ex_sites, unary_ex_mutations, NULL, NULL, 0); verify_trees(&ts, num_trees, parents); verify_edge_array_trees(&ts); tsk_treeseq_free(&ts); } static void test_internal_sample_multi_tree(void) { // clang-format off tsk_id_t parents[] = { 7, 5, 4, 4, 5, 7, TSK_NULL, TSK_NULL, TSK_NULL, 4, 5, 4, 8, 5, 8, TSK_NULL, TSK_NULL, TSK_NULL, 6, 5, 4, 4, 5, 6, TSK_NULL, TSK_NULL, TSK_NULL, }; // clang-format on tsk_treeseq_t ts; uint32_t num_trees = 3; tsk_treeseq_from_text(&ts, 10, internal_sample_ex_nodes, internal_sample_ex_edges, NULL, internal_sample_ex_sites, internal_sample_ex_mutations, NULL, NULL, 0); verify_trees(&ts, num_trees, parents); verify_edge_array_trees(&ts); tsk_treeseq_free(&ts); } static void test_internal_sample_simplified_multi_tree(void) { int ret; tsk_treeseq_t ts, simplified; 
tsk_id_t samples[] = { 2, 3, 5 }; tsk_id_t node_map[9]; tsk_id_t z = TSK_NULL; // clang-format off tsk_id_t parents[] = { /* 0 1 2 3 4 */ 3, 3, z, 2, z, 2, 4, 4, z, z, 3, 3, z, 2, z, }; // clang-format on uint32_t num_trees = 3; tsk_treeseq_from_text(&ts, 10, internal_sample_ex_nodes, internal_sample_ex_edges, NULL, internal_sample_ex_sites, internal_sample_ex_mutations, NULL, NULL, 0); ret = tsk_treeseq_simplify(&ts, samples, 3, 0, &simplified, node_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(node_map[2], 0); CU_ASSERT_EQUAL(node_map[3], 1); CU_ASSERT_EQUAL(node_map[5], 2); verify_trees(&simplified, num_trees, parents); verify_edge_array_trees(&ts); tsk_treeseq_free(&simplified); tsk_treeseq_free(&ts); } static void test_nonbinary_multi_tree(void) { /* We make one mutation for each tree */ // clang-format off tsk_id_t parents[] = { 8, 8, 8, 8, 10, 10, 9, 10, 9, 12, 12, TSK_NULL, TSK_NULL, 8, 8, 8, 8, 10, 11, 9, 10, 9, 11, 12, 12, TSK_NULL, }; // clang-format on tsk_treeseq_t ts; uint32_t num_trees = 2; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, nonbinary_ex_sites, nonbinary_ex_mutations, NULL, NULL, 0); verify_trees(&ts, num_trees, parents); verify_edge_array_trees(&ts); tsk_treeseq_free(&ts); } static void test_simplify_keep_input_roots_multi_tree(void) { /* 0.25┊ 8 ┊ ┊ ┊ ┊ ┏━┻━┓ ┊ ┊ ┊ 0.20┊ ┃ ┃ ┊ ┊ 7 ┊ ┊ ┃ ┃ ┊ ┊ ┏━┻━┓ ┊ 0.17┊ 6 ┃ ┊ 6 ┊ ┃ ┃ ┊ ┊ ┏━┻┓ ┃ ┊ ┏━┻━┓ ┊ ┃ ┃ ┊ 0.09┊ ┃ 5 ┃ ┊ ┃ 5 ┊ ┃ 5 ┊ ┊ ┃ ┏┻┓ ┃ ┊ ┃ ┏━┻┓ ┊ ┃ ┏━┻┓ ┊ 0.07┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┊ ┃ ┃ 4 ┊ ┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┏┻┓ ┊ 0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊ 0.00 2.00 7.00 10.00 Simplifies to 0.25┊ 4 ┊ ┊ ┊ ┊ ┃ ┊ ┊ ┊ 0.20┊ ┃ ┊ ┊ 3 ┊ ┊ ┃ ┊ ┊ ┏┻┓ ┊ 0.17┊ 2 ┊ 2 ┊ ┃ ┃ ┊ ┊ ┏┻┓ ┊ ┏┻┓ ┊ ┃ ┃ ┊ 0.00┊ 0 1 ┊ 0 1 ┊ 0 1 ┊ 0.00 2.00 7.00 10.00 */ int ret = 0; // clang-format off tsk_id_t parents[] = { 2, 2, 4, -1, -1, 2, 2, -1, -1, -1, 3, 3, -1, -1, -1, }; // clang-format on uint32_t num_trees = 3; tsk_id_t samples[] = { 0, 3 }; tsk_treeseq_t ts, simplified; 
tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_simplify( &ts, samples, 2, TSK_SIMPLIFY_KEEP_INPUT_ROOTS, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_trees(&simplified, num_trees, parents); verify_edge_array_trees(&ts); tsk_treeseq_free(&ts); tsk_treeseq_free(&simplified); } static void test_left_to_right_multi_tree(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "1 0 0\n" "1 0 0\n" "0 0.090 0\n" "0 0.170 0\n" "0 0.253 0\n" "0 0.071 0\n" "0 0.202 0\n"; const char *edges = "2 10 7 2,3\n" "0 2 4 1\n" "2 10 4 1\n" "0 2 4 3\n" "2 10 4 7\n" "0 7 5 0,4\n" "7 10 8 0,4\n" "0 2 6 2,5\n"; const char *sites = "1 0\n" "4.5 0\n" "8.5 0\n"; const char *mutations = "0 2 1\n" "1 0 1\n" "2 4 1\n"; // clang-format off tsk_id_t parents[] = { 5, 4, 6, 4, 5, 6, TSK_NULL, TSK_NULL, TSK_NULL, 5, 4, 7, 7, 5, TSK_NULL, TSK_NULL, 4, TSK_NULL, 8, 4, 7, 7, 8, TSK_NULL, TSK_NULL, 4, TSK_NULL, }; // clang-format on tsk_treeseq_t ts; uint32_t num_trees = 3; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); verify_trees(&ts, num_trees, parents); verify_tree_next_prev(&ts); verify_edge_array_trees(&ts); tsk_treeseq_free(&ts); } static void test_gappy_multi_tree(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "1 0 0\n" "1 0 0\n" "0 0.090 0\n" "0 0.170 0\n" "0 0.253 0\n" "0 0.071 0\n" "0 0.202 0\n"; const char *edges = "2 7 7 2\n" "8 10 7 2\n" "2 7 7 3\n" "8 10 7 3\n" "1 2 4 1\n" "2 7 4 1\n" "8 10 4 1\n" "1 2 4 3\n" "2 7 4 7\n" "8 10 4 7\n" "1 7 5 0,4\n" "8 10 8 0,4\n" "1 2 6 2,5\n"; tsk_id_t z = TSK_NULL; // clang-format off tsk_id_t parents[] = { z, z, z, z, z, z, z, z, z, 5, 4, 6, 4, 5, 6, z, z, z, 5, 4, 7, 7, 5, z, z, 4, z, z, z, z, z, z, z, z, z, z, 8, 4, 7, 7, 8, z, z, 4, z, z, z, z, z, z, z, z, z, z, }; // clang-format on tsk_treeseq_t ts; uint32_t num_trees = 6; tsk_treeseq_from_text(&ts, 12, nodes, edges, NULL, NULL, NULL, NULL, NULL, 
0); verify_trees(&ts, num_trees, parents); verify_tree_next_prev(&ts); verify_edge_array_trees(&ts); tsk_treeseq_free(&ts); } static void test_tsk_treeseq_bad_records(void) { int ret = 0; tsk_treeseq_t ts; tsk_table_collection_t tables; uint32_t num_trees = 3; // clang-format off tsk_id_t parents[] = { 6, 5, 8, 5, TSK_NULL, 6, 8, TSK_NULL, TSK_NULL, 6, 5, 4, 4, 5, 6, TSK_NULL, TSK_NULL, TSK_NULL, 7, 5, 4, 4, 5, 7, TSK_NULL, TSK_NULL, TSK_NULL, }; // clang-format on tsk_flags_t load_flags = TSK_TS_INIT_BUILD_INDEXES; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; parse_nodes(paper_ex_nodes, &tables.nodes); parse_edges(paper_ex_edges, &tables.edges); parse_individuals(paper_ex_individuals, &tables.individuals); /* Make sure we have a good set of records */ ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ts.num_trees, 3); verify_trees(&ts, num_trees, parents); tsk_treeseq_free(&ts); /* Left value greater than right */ tables.edges.left[0] = 10.0; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_EDGE_INTERVAL); tsk_treeseq_free(&ts); tables.edges.left[0] = 2.0; ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, 0); verify_trees(&ts, num_trees, parents); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_convenience_arrays_multi_tree(void) { int ret; tsk_treeseq_t ts; tsk_tree_t t; tsk_treeseq_from_text( &ts, 10, unary_ex_nodes, unary_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); verify_edge_array_trees(&ts); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_tree_next(&t)); CU_ASSERT_EQUAL(t.num_children[8], 2); CU_ASSERT_TRUE(tsk_tree_next(&t)); CU_ASSERT_EQUAL(t.num_children[8], 1); CU_ASSERT_TRUE(tsk_tree_next(&t)); CU_ASSERT_EQUAL(t.num_children[8], 0); tsk_tree_free(&t); tsk_treeseq_free(&ts); } static void test_multiroot_mrca(void) { int ret; 
tsk_treeseq_t ts; tsk_tree_t tree; tsk_id_t mrca; tsk_treeseq_from_text(&ts, 10, multiroot_ex_nodes, multiroot_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&tree, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&tree); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_mrca(&tree, 0, 0, &mrca), 0); CU_ASSERT_EQUAL(mrca, 0); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_mrca(&tree, 0, 1, &mrca), 0); CU_ASSERT_EQUAL(mrca, 10); /* MRCA of two nodes in different subtrees is TSK_NULL */ CU_ASSERT_EQUAL_FATAL(tsk_tree_get_mrca(&tree, 0, 2, &mrca), 0); CU_ASSERT_EQUAL(mrca, TSK_NULL); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_mrca(&tree, 2, 0, &mrca), 0); CU_ASSERT_EQUAL(mrca, TSK_NULL); tsk_tree_free(&tree); tsk_treeseq_free(&ts); } /*======================================================= * Sample sets *======================================================*/ static void test_simple_sample_sets(void) { // clang-format off sample_count_test_t tests[] = { {0, 0, 1}, {0, 5, 2}, {0, 6, 3}, {1, 4, 2}, {1, 5, 3}, {1, 6, 4}}; // clang-format on uint32_t num_tests = 6; tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, NULL, NULL, paper_ex_individuals, NULL, 0); verify_sample_counts(&ts, num_tests, tests, 0); verify_sample_counts(&ts, num_tests, tests, TSK_SEEK_SKIP); verify_sample_sets(&ts); tsk_treeseq_free(&ts); } static void test_nonbinary_sample_sets(void) { // clang-format off sample_count_test_t tests[] = { {0, 0, 1}, {0, 8, 4}, {0, 9, 5}, {0, 10, 3}, {0, 12, 8}, {1, 5, 1}, {1, 8, 4}, {1, 9, 5}, {0, 10, 2}, {0, 11, 1}}; // clang-format on uint32_t num_tests = 8; tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); verify_sample_counts(&ts, num_tests, tests, 0); verify_sample_counts(&ts, num_tests, tests, TSK_SEEK_SKIP); verify_sample_sets(&ts); tsk_treeseq_free(&ts); } static void test_internal_sample_sample_sets(void) { // 
clang-format off sample_count_test_t tests[] = { {0, 0, 1}, {0, 5, 4}, {0, 4, 2}, {0, 7, 5}, {1, 4, 2}, {1, 5, 4}, {1, 8, 5}, {2, 5, 4}, {2, 6, 5}}; // clang-format on uint32_t num_tests = 9; tsk_treeseq_t ts; tsk_treeseq_from_text(&ts, 10, internal_sample_ex_nodes, internal_sample_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); verify_sample_counts(&ts, num_tests, tests, 0); verify_sample_counts(&ts, num_tests, tests, TSK_SEEK_SKIP); verify_sample_sets(&ts); tsk_treeseq_free(&ts); } static void test_non_sample_leaf_sample_lists(void) { const char *nodes = "1 0 0\n" "0 0 0\n" "1 2 0\n"; const char *edges = "0 1 2 0,1\n"; const tsk_id_t left_sample[3] = { 0, -1, 1 }; const tsk_id_t right_sample[3] = { 0, -1, 0 }; const tsk_id_t next_sample[2] = { -1, 0 }; const tsk_id_t samples[2] = { 0, 2 }; const tsk_id_t sample_index_map[3] = { 0, -1, 1 }; tsk_treeseq_t ts; tsk_tree_t t; tsk_id_t i; int ret; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); for (i = 0; i < 3; i++) { CU_ASSERT_EQUAL_FATAL(left_sample[i], t.left_sample[i]); CU_ASSERT_EQUAL_FATAL(right_sample[i], t.right_sample[i]); CU_ASSERT_EQUAL_FATAL(sample_index_map[i], ts.sample_index_map[i]); } for (i = 0; i < 2; i++) { CU_ASSERT_EQUAL_FATAL(next_sample[i], t.next_sample[i]); CU_ASSERT_EQUAL_FATAL(samples[i], t.samples[i]); } tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_virtual_root_properties(void) { int ret; tsk_treeseq_t ts; tsk_tree_t t; int depth; double time, length; tsk_id_t node; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_depth(&t, t.virtual_root, &depth), 0) CU_ASSERT_EQUAL_FATAL(depth, -1); 
CU_ASSERT_EQUAL_FATAL(tsk_tree_get_time(&t, t.virtual_root, &time), 0) /* Workaround problems in IEEE floating point macros. We may want to * add tsk_isinf (like tsk_isnan) at some point, but not worth it just * for this test case */ CU_ASSERT_TRUE(isinf((float) time)); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_mrca(&t, t.virtual_root, 0, &node), 0) CU_ASSERT_EQUAL(node, t.virtual_root); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_mrca(&t, 0, t.virtual_root, &node), 0) CU_ASSERT_EQUAL(node, t.virtual_root); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_parent(&t, t.virtual_root, &node), 0) CU_ASSERT_EQUAL(node, TSK_NULL); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_branch_length(&t, t.virtual_root, &length), 0) CU_ASSERT_EQUAL(length, 0); /* The definition of "descendant" is that node v is on the path from * u to a root. Since there is no parent link from roots to the * virtual_root, it's consistent with this definition to return false * for every node. */ CU_ASSERT_FALSE(tsk_tree_is_descendant(&t, 0, t.virtual_root)); CU_ASSERT_FALSE( tsk_tree_is_descendant(&t, t.left_child[t.virtual_root], t.virtual_root)); CU_ASSERT_FALSE(tsk_tree_is_descendant(&t, t.virtual_root, 0)); /* The virtual_root *is* a descendent of itself, though. This is * consistent with other nodes that are not "in" the tree being * descendents of themselves, despite not being roots in the tree. 
*/
    CU_ASSERT_TRUE(tsk_tree_is_descendant(&t, t.virtual_root, t.virtual_root));

    CU_ASSERT_FALSE(tsk_tree_is_sample(&t, t.virtual_root));

    CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&t), 1);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/*
 * Checks what a tree initialised with TSK_NO_SAMPLE_COUNTS reports. The
 * assertions expect zero roots, a TSK_NULL left root, and
 * TSK_ERR_UNSUPPORTED_OPERATION from every preorder/postorder traversal
 * entry point.
 */
static void
test_no_sample_count_semantics(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t t;
    /* Single-element output slot; every traversal call below is expected
     * to return an error. */
    tsk_id_t nodes;
    tsk_size_t n;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_tree_init(&t, &ts, TSK_NO_SAMPLE_COUNTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);

    CU_ASSERT_EQUAL(tsk_tree_get_num_roots(&t), 0);
    CU_ASSERT_EQUAL(tsk_tree_get_left_root(&t), TSK_NULL);

    /* Whole-tree traversals. */
    CU_ASSERT_EQUAL(tsk_tree_preorder(&t, &nodes, &n), TSK_ERR_UNSUPPORTED_OPERATION);
    CU_ASSERT_EQUAL(tsk_tree_postorder(&t, &nodes, &n), TSK_ERR_UNSUPPORTED_OPERATION);
    CU_ASSERT_EQUAL(tsk_tree_preorder_samples_from(&t, -1, &nodes, &n),
        TSK_ERR_UNSUPPORTED_OPERATION);

    /* Traversals that start from the virtual root. */
    CU_ASSERT_EQUAL(tsk_tree_preorder_from(&t, t.virtual_root, &nodes, &n),
        TSK_ERR_UNSUPPORTED_OPERATION);
    CU_ASSERT_EQUAL(tsk_tree_postorder_from(&t, t.virtual_root, &nodes, &n),
        TSK_ERR_UNSUPPORTED_OPERATION);
    CU_ASSERT_EQUAL(tsk_tree_preorder_samples_from(&t, t.virtual_root, &nodes, &n),
        TSK_ERR_UNSUPPORTED_OPERATION);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/*=======================================================
 * Tree traversals
 *=======================================================*/

/*
 * Asserts that the first n entries of the node lists l1 and l2 are
 * pairwise equal. The assertions are non-fatal, so every mismatch is
 * reported.
 */
static void
verify_node_lists(tsk_size_t n, tsk_id_t *l1, tsk_id_t *l2)
{
    tsk_size_t j;

    for (j = 0; j < n; j++) {
        /* printf("%d %d\n", l1[j], l2[j]); */
        CU_ASSERT_EQUAL(l1[j], l2[j]);
    }
}

/*
 * Checks preorder, postorder and sample-preorder traversals of the
 * single-tree example against hand-written node orders, starting from the
 * default root (-1), the virtual root and internal nodes, and then checks
 * that out-of-range start nodes are rejected.
 */
static void
test_single_tree_traversal(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t t;
    tsk_size_t num_nodes = 7;
    /* The "_vr" variants are the expected orders when the traversal starts
     * at the virtual root, which then appears in the output as node 7. */
    tsk_id_t preorder[] = { 6, 4, 0, 1, 5, 2, 3 };
    tsk_id_t preorder_vr[] = { 7, 6, 4, 0, 1, 5, 2, 3 };
    tsk_id_t preorder_samples[] = { 0, 1, 2, 3 };
    tsk_id_t postorder[] = { 0, 1, 4, 2, 3, 5, 6 };
    tsk_id_t postorder_vr[] =
{ 0, 1, 4, 2, 3, 5, 6, 7 };
    /* One slot more than num_nodes: the traversals from the virtual root
     * below are expected to return num_nodes + 1 entries. */
    tsk_id_t nodes[num_nodes + 1];
    tsk_size_t n;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);

    /* Preorder over the whole tree; a start node of -1 is expected to give
     * the same result as the plain call. */
    ret = tsk_tree_preorder(&t, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, num_nodes);
    verify_node_lists(n, nodes, preorder);

    ret = tsk_tree_preorder_from(&t, -1, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, num_nodes);
    verify_node_lists(n, nodes, preorder);

    ret = tsk_tree_preorder_from(&t, t.virtual_root, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, num_nodes + 1);
    verify_node_lists(n, nodes, preorder_vr);

    /* Sample-only preorder: the virtual root itself is not listed. */
    ret = tsk_tree_preorder_samples_from(&t, -1, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 4);
    verify_node_lists(n, nodes, preorder_samples);

    ret = tsk_tree_preorder_samples_from(&t, t.virtual_root, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 4);
    verify_node_lists(n, nodes, preorder_samples);

    /* Subtree rooted at node 5: expected output is a slice of the
     * whole-tree orders. */
    ret = tsk_tree_preorder_from(&t, 5, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 3);
    verify_node_lists(n, nodes, preorder + 4);

    ret = tsk_tree_preorder_samples_from(&t, 5, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 2);
    verify_node_lists(n, nodes, preorder_samples + 2);

    /* Postorder counterparts of the checks above. */
    ret = tsk_tree_postorder(&t, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, num_nodes);
    verify_node_lists(n, nodes, postorder);

    ret = tsk_tree_postorder_from(&t, -1, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, num_nodes);
    verify_node_lists(n, nodes, postorder);

    ret = tsk_tree_postorder_from(&t, t.virtual_root, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, num_nodes + 1);
    verify_node_lists(n, nodes, postorder_vr);

    /* Subtree rooted at node 4: the first three postorder entries. */
    ret = tsk_tree_postorder_from(&t, 4, nodes, &n);
CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 3);
    verify_node_lists(n, nodes, postorder);

    /* Check errors */
    /* Start nodes -2 and 8 are expected to be rejected by every "_from"
     * traversal. */
    ret = tsk_tree_preorder_from(&t, -2, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    ret = tsk_tree_preorder_from(&t, 8, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

    ret = tsk_tree_preorder_samples_from(&t, -2, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    ret = tsk_tree_preorder_samples_from(&t, 8, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

    ret = tsk_tree_postorder_from(&t, -2, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
    ret = tsk_tree_postorder_from(&t, 8, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/* printed out in tree order.
0.90┊             ┊         11  ┊             ┊
    ┊             ┊         ┏┻┓ ┊             ┊
0.80┊         10  ┊         ┃ ┃ ┊             ┊
    ┊         ┏┻┓ ┊         ┃ ┃ ┊             ┊
0.40┊     9   ┃ ┃ ┊    9    ┃ ┃ ┊     9       ┊
    ┊   ┏━┻┓  ┃ ┃ ┊  ┏━┻━┓  ┃ ┃ ┊   ┏━┻━━┓    ┊
0.30┊   ┃  ┃  ┃ ┃ ┊  ┃   8  ┃ ┃ ┊   ┃    8    ┊
    ┊   ┃  ┃  ┃ ┃ ┊  ┃  ┏┻┓ ┃ ┃ ┊   ┃   ┏┻┓   ┊
0.20┊   ┃  7  ┃ ┃ ┊  7  ┃ ┃ ┃ ┃ ┊   7   ┃ ┃   ┊
    ┊   ┃ ┏┻┓ ┃ ┃ ┊ ┏┻┓ ┃ ┃ ┃ ┃ ┊ ┏━┻┓  ┃ ┃   ┊
0.10┊   ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┃ ┃ ┃ ┊ ┃  6  ┃ ┃   ┊
    ┊   ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┏┻┓ ┃ ┃   ┊
0.00┊ 5 2 3 4 0 1 ┊ 3 4 1 2 0 5 ┊ 4 0 3 1 2 5 ┊
    0             4             8            10
*/

/*
 * Traversal checks on the first tree of the multiroot example (the
 * leftmost panel of the diagram above): whole-tree, virtual-root and
 * subtree traversals, plus a node that is not part of the first tree.
 */
static void
test_multiroot_tree_traversal(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t t;
    /* The "_vr" variants are the expected orders when the traversal starts
     * at the virtual root, which then appears in the output as node 12. */
    tsk_id_t preorder[] = { 5, 9, 2, 7, 3, 4, 10, 0, 1 };
    tsk_id_t preorder_vr[] = { 12, 5, 9, 2, 7, 3, 4, 10, 0, 1 };
    tsk_id_t preorder_samples[] = { 5, 2, 3, 4, 0, 1 };
    tsk_id_t postorder[] = { 5, 2, 3, 4, 7, 9, 0, 1, 10 };
    tsk_id_t postorder_vr[] = { 5, 2, 3, 4, 7, 9, 0, 1, 10, 12 };
    tsk_id_t nodes[13];
    tsk_size_t n;

    tsk_treeseq_from_text(&ts, 10, multiroot_ex_nodes, multiroot_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);

    ret = tsk_tree_preorder(&t, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 9);
verify_node_lists(n, nodes, preorder);

    ret = tsk_tree_preorder_from(&t, -1, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 9);
    verify_node_lists(n, nodes, preorder);

    ret = tsk_tree_preorder_from(&t, t.virtual_root, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 10);
    verify_node_lists(n, nodes, preorder_vr);

    /* Subtree rooted at node 10: the last three preorder entries. */
    ret = tsk_tree_preorder_from(&t, 10, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 3);
    verify_node_lists(n, nodes, preorder + 6);

    /* Sample-only preorder from the default root and the virtual root. */
    ret = tsk_tree_preorder_samples_from(&t, -1, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 6);
    verify_node_lists(n, nodes, preorder_samples);

    ret = tsk_tree_preorder_samples_from(&t, t.virtual_root, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 6);
    verify_node_lists(n, nodes, preorder_samples);

    /* Node 5 is the first entry of preorder_samples, so a one-element
     * comparison against the start of that list checks it. */
    ret = tsk_tree_preorder_samples_from(&t, 5, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 1);
    verify_node_lists(n, nodes, preorder_samples);

    ret = tsk_tree_preorder_samples_from(&t, 10, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 2);
    verify_node_lists(n, nodes, preorder_samples + 4);

    /* Postorder counterparts of the checks above. */
    ret = tsk_tree_postorder(&t, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 9);
    verify_node_lists(n, nodes, postorder);

    ret = tsk_tree_postorder_from(&t, -1, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 9);
    verify_node_lists(n, nodes, postorder);

    ret = tsk_tree_postorder_from(&t, t.virtual_root, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 10);
    verify_node_lists(n, nodes, postorder_vr);

    ret = tsk_tree_postorder_from(&t, 10, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 3);
    verify_node_lists(n, nodes, postorder + 6);

    /* Nodes that aren't "in" the tree have singleton traversal lists and
     * connect to no samples */
    ret = tsk_tree_preorder_from(&t, 11, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 1);
CU_ASSERT_EQUAL_FATAL(nodes[0], 11);
    ret = tsk_tree_postorder_from(&t, 11, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 1);
    CU_ASSERT_EQUAL_FATAL(nodes[0], 11);
    ret = tsk_tree_preorder_samples_from(&t, 11, nodes, &n);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(n, 0);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/*
 * Exercises tsk_tree_seek and tsk_tree_seek_index on the three-tree paper
 * example, passing seek_options through to every seek call. Covers seeks
 * between every ordered pair of trees, seeks to positions just below each
 * right breakpoint, seeks from a freshly initialised tree, and seeks from
 * a tree that has been stepped back to index -1.
 */
static void
verify_seek_multi_tree(tsk_flags_t seek_options)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t t;
    /* breakpoints[j] is used as a position inside tree j; the final entry
     * is never passed to a seek call in this function. */
    double breakpoints[] = { 0, 2, 7, 10 };
    tsk_id_t num_trees = 3;
    tsk_id_t j, k;

    tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, NULL, NULL,
        paper_ex_individuals, NULL, 0);

    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Seek to tree j by position and by index, then from there to every
     * tree k, so each (from, to) pair is covered. */
    for (j = 0; j < num_trees; j++) {
        ret = tsk_tree_seek(&t, breakpoints[j], seek_options);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        CU_ASSERT_EQUAL_FATAL(t.index, j);
        ret = tsk_tree_seek_index(&t, j, seek_options);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        CU_ASSERT_EQUAL_FATAL(t.index, j);
        for (k = 0; k < num_trees; k++) {
            ret = tsk_tree_seek(&t, breakpoints[k], seek_options);
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_EQUAL_FATAL(t.index, k);
            ret = tsk_tree_seek_index(&t, k, seek_options);
            CU_ASSERT_EQUAL_FATAL(ret, 0);
            CU_ASSERT_EQUAL_FATAL(t.index, k);
        }
    }

    /* Positions just below a breakpoint still belong to the tree on the
     * left of it. */
    ret = tsk_tree_seek(&t, 1.99999, seek_options);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(t.index, 0);
    ret = tsk_tree_seek(&t, 6.99999, seek_options);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(t.index, 1);
    ret = tsk_tree_seek(&t, 9.99999, seek_options);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(t.index, 2);

    tsk_tree_free(&t);

    /* Seek to all positions from a new tree.
*/
    for (j = 0; j < num_trees; j++) {
        ret = tsk_tree_init(&t, &ts, 0);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        ret = tsk_tree_seek(&t, breakpoints[j], seek_options);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        CU_ASSERT_EQUAL_FATAL(t.index, j);
        tsk_tree_free(&t);
    }

    /* Seek to all positions from a non-new tree in the null state*/
    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    for (j = 0; j < num_trees; j++) {
        /* Move to the first tree and step back once; the assertion below
         * expects that to leave the tree at index -1. */
        ret = tsk_tree_seek(&t, 0, seek_options);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        ret = tsk_tree_prev(&t);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        CU_ASSERT_EQUAL_FATAL(t.index, -1);
        ret = tsk_tree_seek(&t, breakpoints[j], seek_options);
        CU_ASSERT_EQUAL_FATAL(ret, 0);
        CU_ASSERT_EQUAL_FATAL(t.index, j);
    }

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/*
 * Runs the seek checks once with default options and once with
 * TSK_SEEK_SKIP.
 */
static void
test_seek_multi_tree(void)
{
    verify_seek_multi_tree(0);
    verify_seek_multi_tree(TSK_SEEK_SKIP);
}

/*
 * Checks that out-of-range seek targets on the paper example (sequence
 * length 10) are rejected with TSK_ERR_SEEK_OUT_OF_BOUNDS: positions -1,
 * 10 and 11, and tree indexes num_trees and -1.
 */
static void
test_seek_errors(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t t;

    tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, NULL, NULL,
        paper_ex_individuals, NULL, 0);

    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* Position 10 equals the sequence length and is expected to be
     * rejected as well. */
    ret = tsk_tree_seek(&t, -1, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);
    ret = tsk_tree_seek(&t, 10, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);
    ret = tsk_tree_seek(&t, 11, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);

    ret = tsk_tree_seek_index(&t, (tsk_id_t) ts.num_trees, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);
    ret = tsk_tree_seek_index(&t, -1, 0);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
}

/*=======================================================
 * KC Distance tests.
*=======================================================*/

/*
 * KC distance on tree sequences that contain a single node and no edges.
 * With a sample node the distance to itself is expected to be 0; with a
 * non-sample node both the tree-sequence and the tree-level call are
 * expected to return TSK_ERR_MULTIPLE_ROOTS.
 */
static void
test_isolated_node_kc(void)
{
    const char *single_leaf = "1 0 0";
    const char *single_internal = "0 0 0";
    const char *edges = "";
    tsk_treeseq_t ts;
    tsk_tree_t t;
    int ret;
    double result = 0;

    /* Case 1: the only node is flagged as a sample. */
    tsk_treeseq_from_text(&ts, 1, single_leaf, edges, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0);
    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_kc_distance(&t, &t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0);
    tsk_treeseq_free(&ts);
    tsk_tree_free(&t);

    /* Case 2: the only node is not a sample, so the tree reports no left
     * root (asserted below). */
    tsk_treeseq_from_text(
        &ts, 1, single_internal, edges, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_ROOTS);
    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_left_root(&t), TSK_NULL);
    ret = tsk_tree_kc_distance(&t, &t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_ROOTS);
    tsk_treeseq_free(&ts);
    tsk_tree_free(&t);
}

/*
 * KC distance of the single-tree example to itself, at the tree-sequence
 * level and between a tree and a copy of it. The third argument of the
 * distance calls is 0 or 1 (presumably the KC lambda weight -- confirm
 * against the API docs); the expected distance is 0 in every case.
 */
static void
test_single_tree_kc(void)
{
    int ret;
    tsk_treeseq_t ts;
    tsk_tree_t t, other_t;
    double result = 0;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0);
    ret = tsk_treeseq_kc_distance(&ts, &ts, 1, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0);

    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_init(&other_t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret =
tsk_tree_first(&other_t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    /* other_t was initialised above, so the copy is made with TSK_NO_INIT. */
    ret = tsk_tree_copy(&t, &other_t, TSK_NO_INIT);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    check_trees_identical(&t, &other_t);

    ret = tsk_tree_kc_distance(&t, &other_t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0);
    ret = tsk_tree_kc_distance(&t, &other_t, 1, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0);

    tsk_treeseq_free(&ts);
    tsk_tree_free(&t);
    tsk_tree_free(&other_t);
}

/*
 * KC distance between two tree sequences that share the same edges but
 * whose internal nodes have different times. The expected distance is 0
 * when the third argument is 0 and about 4.243 when it is 1, for both the
 * tree-sequence and the tree-level call.
 */
static void
test_two_trees_kc(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 2 0\n"
                        "0 3 0\n";
    /* Same node layout; only the times of the two internal nodes differ. */
    const char *nodes_other = "1 0 0\n"
                              "1 0 0\n"
                              "1 0 0\n"
                              "0 4 0\n"
                              "0 6 0\n";
    const char *edges = "0 1 3 0,1\n"
                        "0 1 4 2,3\n";
    int ret;
    tsk_treeseq_t ts, other_ts;
    tsk_tree_t t, other_t;
    double result = 0;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);

    tsk_treeseq_from_text(
        &other_ts, 1, nodes_other, edges, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &other_ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0);
    ret = tsk_treeseq_kc_distance(&ts, &other_ts, 1, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_DOUBLE_EQUAL_FATAL(result, 4.243, 1e-2);

    ret = tsk_tree_init(&other_t, &other_ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&other_t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_kc_distance(&t, &other_t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0);
    ret = tsk_tree_kc_distance(&t, &other_t, 1, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_DOUBLE_EQUAL_FATAL(result, 4.243, 1e-2);

    tsk_treeseq_free(&ts);
    tsk_treeseq_free(&other_ts);
    tsk_tree_free(&t);
    tsk_tree_free(&other_t);
}

/*
 * Checks sequence-length validation in tsk_treeseq_init and then the KC
 * distance behaviour of a tree sequence built from empty tables.
 */
static void
test_empty_tree_kc(void)
{
    tsk_treeseq_t ts;
    tsk_table_collection_t tables;
    tsk_tree_t
t;
    tsk_id_t v;
    int ret;
    double result = 0;

    ret = tsk_table_collection_init(&tables, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    /* The sequence length as left by tsk_table_collection_init, then NAN
     * and INFINITY, are each expected to be rejected. */
    ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH);
    tsk_treeseq_free(&ts);
    tables.sequence_length = NAN;
    ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH);
    tsk_treeseq_free(&ts);
    tables.sequence_length = INFINITY;
    ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH);
    tsk_treeseq_free(&ts);

    /* A finite positive length loads, giving an empty tree sequence. */
    tables.sequence_length = 1.0;
    ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES);
    CU_ASSERT_EQUAL_FATAL(ret, 0);

    verify_empty_tree_sequence(&ts, 1.0);

    ret = tsk_treeseq_kc_distance(&ts, &ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_ROOTS);

    /* The single tree spans [0, 1) and has no roots. Index 0 of the tree
     * arrays is read below although the tables hold no rows (presumably
     * the virtual root's slot -- confirm); node ID 1 is out of bounds. */
    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_left_root(&t), TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(t.interval.left, 0);
    CU_ASSERT_EQUAL_FATAL(t.interval.right, 1);
    CU_ASSERT_EQUAL_FATAL(t.parent[0], TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(t.left_child[0], TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(t.right_child[0], TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(t.left_sib[0], TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(t.right_sib[0], TSK_NULL);
    CU_ASSERT_EQUAL_FATAL(tsk_tree_get_parent(&t, 1, &v), TSK_ERR_NODE_OUT_OF_BOUNDS);

    ret = tsk_tree_kc_distance(&t, &t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_ROOTS);

    tsk_tree_free(&t);
    tsk_treeseq_free(&ts);
    tsk_table_collection_free(&tables);
}

/*
 * KC distance of a star tree (one parent with four sample children) to
 * itself; the expected distance is 0.
 */
static void
test_nonbinary_tree_kc(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0";
    const char *edges = "0 1 4 0,1,2,3\n";
    tsk_treeseq_t ts;
    tsk_tree_t t;
    int ret;
    double result = 0;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
tsk_treeseq_kc_distance(&ts, &ts, 0, &result); CU_ASSERT_EQUAL_FATAL(result, 0); ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); tsk_tree_kc_distance(&t, &t, 0, &result); CU_ASSERT_EQUAL_FATAL(result, 0); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_nonzero_samples_kc(void) { const char *nodes = "0 0 0\n" /* unused node at the start */ "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges = "0 1 3 1,2\n"; tsk_treeseq_t ts; tsk_tree_t t; int ret; double result = 0; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_treeseq_kc_distance(&ts, &ts, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(result, 0); ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); ret = tsk_tree_kc_distance(&t, &t, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(result, 0); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_internal_samples_kc(void) { const char *nodes = "1 0 0\n" "1 0 0\n" "1 1 0"; const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts; tsk_tree_t t; int ret; double result = 0; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); /* Permitted in tree sequences */ ret = tsk_treeseq_kc_distance(&ts, &ts, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(result, 0.0); ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); ret = tsk_tree_kc_distance(&t, &t, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_non_sample_leaf_kc(void) { const char *nodes = "1 0 0\n" "0 0 0\n" "0 1 0\n"; const char *edges = "0 1 2 0,1\n"; tsk_treeseq_t ts; tsk_tree_t t; int ret; double result = 0; tsk_treeseq_from_text(&ts, 1, nodes, edges, 
NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0.0);
    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_kc_distance(&t, &t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    CU_ASSERT_EQUAL_FATAL(result, 0.0);
    tsk_treeseq_free(&ts);
    tsk_tree_free(&t);
}

/*
 * KC distance between a three-sample and a two-sample tree (sequence).
 * Both the tree-sequence and the tree-level call are expected to fail
 * with TSK_ERR_SAMPLE_SIZE_MISMATCH.
 */
static void
test_unequal_sample_size_kc(void)
{
    /* Three samples. */
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 2 0\n"
                        "0 3 0\n";
    /* Two samples. */
    const char *nodes_other = "1 0 0\n"
                              "1 0 0\n"
                              "0 1 0\n";
    const char *edges = "0 1 3 0,1\n"
                        "0 1 4 2,3\n";
    const char *edges_other = "0 1 2 0,1\n";
    int ret;
    tsk_treeseq_t ts, other_ts;
    tsk_tree_t t, other_t;
    double result = 0;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    tsk_treeseq_from_text(
        &other_ts, 1, nodes_other, edges_other, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &other_ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SAMPLE_SIZE_MISMATCH);

    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_init(&other_t, &other_ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&other_t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_kc_distance(&t, &other_t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SAMPLE_SIZE_MISMATCH);

    tsk_treeseq_free(&ts);
    tsk_treeseq_free(&other_ts);
    tsk_tree_free(&t);
    tsk_tree_free(&other_t);
}

/*
 * KC distance between two trees with the same number of samples but
 * different sample node IDs (0-2 versus 1-3). Both levels are expected to
 * fail with TSK_ERR_SAMPLES_NOT_EQUAL.
 */
static void
test_unequal_samples_kc(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 2 0\n"
                        "0 3 0\n";
    const char *nodes_other = "0 0 0\n" /* Unused node at the start */
                              "1 0 0\n"
                              "1 0 0\n"
                              "1 0 0\n"
                              "0 2 0\n"
                              "0 3 0\n";
    const char *edges = "0 1 3 0,1\n"
                        "0 1 4 2,3\n";
    /* Same shape as "edges", with every node ID shifted up by one. */
    const char *edges_other = "0 1 4 1,2\n"
                              "0 1 5 3,4\n";
    int ret;
    tsk_treeseq_t ts, other_ts;
tsk_tree_t t, other_t;
    double result = 0;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    tsk_treeseq_from_text(
        &other_ts, 1, nodes_other, edges_other, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &other_ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SAMPLES_NOT_EQUAL);

    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_init(&other_t, &other_ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&other_t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_kc_distance(&t, &other_t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SAMPLES_NOT_EQUAL);

    tsk_treeseq_free(&ts);
    tsk_treeseq_free(&other_ts);
    tsk_tree_free(&t);
    tsk_tree_free(&other_t);
}

/*
 * Tree-level KC distance on a tree whose root (node 3) has a single
 * child; the call is expected to fail with TSK_ERR_UNARY_NODES.
 */
static void
test_unary_nodes_kc(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n"
                        "0 2 0";
    const char *edges = "0 1 2 0,1\n"
                        "0 1 3 2";
    tsk_treeseq_t ts;
    tsk_tree_t t;
    int ret;
    double result = 0;

    tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);
    ret = tsk_tree_kc_distance(&t, &t, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNARY_NODES);
    tsk_treeseq_free(&ts);
    tsk_tree_free(&t);
}

/*
 * Tree-level KC distance on a tree that was initialised without
 * TSK_SAMPLE_LISTS; the call is expected to fail with
 * TSK_ERR_NO_SAMPLE_LISTS.
 */
static void
test_no_sample_lists_kc(void)
{
    tsk_treeseq_t ts;
    tsk_tree_t t;
    int ret = 0;
    double result = 0;

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    /* Options 0: no sample lists are requested. */
    ret = tsk_tree_init(&t, &ts, 0);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    ret = tsk_tree_first(&t);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);

    ret = tsk_tree_kc_distance(&t, &t, 9, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NO_SAMPLE_LISTS);

    tsk_treeseq_free(&ts);
    tsk_tree_free(&t);
}

/*
 * Tree-sequence KC distance between sequences of length 1 and 2 that use
 * the same nodes; expected to fail with TSK_ERR_SEQUENCE_LENGTH_MISMATCH.
 */
static void
test_unequal_sequence_lengths_kc(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 2 0\n"
                        "0 3 0\n";
    /* Same topology; the edges span [0, 1) and [0, 2) respectively. */
    const char *edges_1 = "0 1 3 0,1\n"
                          "0 1 4 2,3\n";
    const char *edges_2 = "0 2 3 0,1\n"
                          "0 2 4 2,3\n";

    tsk_treeseq_t ts, other;
    int ret;
    double result = 0;

    tsk_treeseq_from_text(&ts, 1, nodes, edges_1, NULL, NULL, NULL, NULL, NULL, 0);
    tsk_treeseq_from_text(&other, 2, nodes, edges_2, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &other, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEQUENCE_LENGTH_MISMATCH);
    tsk_treeseq_free(&ts);
    tsk_treeseq_free(&other);
}

/*
 * Tree-sequence KC distance between a sequence whose edges change at
 * position 5 and one whose edges all span [0, 10). The expected value is
 * the average of sqrt(8) and sqrt(6), each weighted by a span of 5 out of
 * the sequence length of 10.
 */
static void
test_different_number_trees_kc(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 1 0\n"
                        "0 2 0\n"
                        "0 3 0\n"
                        "0 4 0\n"
                        "0 5 0\n";
    const char *edges = "0 10 5 0,1\n"
                        "0 10 6 3,4\n"
                        "5 10 7 2,5\n"
                        "0 5 8 2\n"
                        "0 10 8 6\n"
                        "5 10 8 7\n"
                        "0 5 9 5,8\n";
    const char *other_nodes = "1 0 0\n"
                              "1 0 0\n"
                              "1 0 0\n"
                              "1 0 0\n"
                              "1 0 0\n"
                              "0 1 0\n"
                              "0 2 0\n"
                              "0 3 0\n"
                              "0 4 0\n";
    const char *other_edges = "0 10 5 0,1\n"
                              "0 10 6 2,3\n"
                              "0 10 7 4,5\n"
                              "0 10 8 6,7\n";
    tsk_treeseq_t ts, other;
    double result, expected;
    int ret = 0;

    tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    tsk_treeseq_from_text(
        &other, 10, other_nodes, other_edges, NULL, NULL, NULL, NULL, NULL, 0);
    ret = tsk_treeseq_kc_distance(&ts, &other, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, 0);
    expected = (sqrt(8.0) * 5.0 + sqrt(6.0) * 5.0) / 10.0;
    CU_ASSERT_DOUBLE_EQUAL_FATAL(result, expected, 1e-2);

    tsk_treeseq_free(&ts);
    tsk_treeseq_free(&other);
}

/*
 * Tree-sequence KC distance between the unary example and a plain binary
 * tree of the same length. The call is expected to fail with
 * TSK_ERR_UNARY_NODES whichever sequence is passed first.
 */
static void
test_offset_trees_with_errors_kc(void)
{
    const char *nodes = "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "1 0 0\n"
                        "0 2 0\n"
                        "0 3 0\n"
                        "0 4 0\n";
    const char *edges = "0 10 4 0,1\n"
                        "0 10 5 2,3\n"
                        "0 10 6 4,5\n";
    tsk_treeseq_t ts, other;
    double result;
    int ret = 0;

    tsk_treeseq_from_text(
        &ts, 10, unary_ex_nodes, unary_ex_edges, NULL, NULL, NULL, NULL, NULL, 0);
    tsk_treeseq_from_text(&other, 10, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts),
10);
    CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&other), 10);
    /* The error is expected in both argument orders. */
    ret = tsk_treeseq_kc_distance(&ts, &other, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNARY_NODES);
    ret = tsk_treeseq_kc_distance(&other, &ts, 0, &result);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNARY_NODES);

    tsk_treeseq_free(&ts);
    tsk_treeseq_free(&other);
}

/*=======================================================
 * Miscellaneous tests.
 *======================================================*/

/*
 * Error-path checks for tsk_treeseq_genealogical_nearest_neighbours on
 * the single-tree example: an invalid reference-set count, overlapping or
 * duplicated reference sets, and out-of-range node IDs in the reference
 * sets and in the focal list.
 */
static void
test_genealogical_nearest_neighbours_errors(void)
{
    int ret;
    tsk_treeseq_t ts;
    const tsk_id_t *reference_sets[2];
    tsk_id_t reference_set_0[4], reference_set_1[4];
    tsk_id_t focal[] = { 0, 1, 2, 3 };
    tsk_size_t reference_set_size[2];
    tsk_size_t num_focal = 4;
    /* Output buffer with room for 2 * num_focal doubles (presumably one
     * value per focal node and reference set -- confirm against the API). */
    double *A = tsk_malloc(2 * num_focal * sizeof(double));
    CU_ASSERT_FATAL(A != NULL);

    tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
        NULL, NULL, NULL, 0);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 4);
    CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1);

    /* The sixth argument takes the values 0, INT16_MAX and 2 in this test
     * (presumably the number of reference sets -- confirm against the
     * API). 0 and INT16_MAX are expected to be rejected; reference_sets
     * and reference_set_size are still unset at this point. */
    ret = tsk_treeseq_genealogical_nearest_neighbours(
        &ts, focal, num_focal, reference_sets, reference_set_size, 0, 0, A);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);
    ret = tsk_treeseq_genealogical_nearest_neighbours(
        &ts, focal, num_focal, reference_sets, reference_set_size, INT16_MAX, 0, A);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);

    /* Overlapping sample sets */
    reference_sets[0] = focal;
    reference_set_size[0] = 1;
    reference_sets[1] = focal;
    reference_set_size[1] = num_focal;
    ret = tsk_treeseq_genealogical_nearest_neighbours(
        &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A);
    CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE);

    /* bad values in the sample sets */
    /* Start from two disjoint, valid reference sets {0, 1} and {2, 3}. */
    reference_set_0[0] = 0;
    reference_set_0[1] = 1;
    reference_set_1[0] = 2;
    reference_set_1[1] = 3;
    reference_set_size[0] = 2;
    reference_set_size[1] = 2;
    reference_sets[0] = reference_set_0;
    reference_sets[1] = reference_set_1;
    ret =
tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, 0); reference_set_0[0] = -1; ret = tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); reference_set_0[0] = (tsk_id_t) tsk_treeseq_get_num_nodes(&ts); ret = tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); reference_set_0[0] = (tsk_id_t) tsk_treeseq_get_num_nodes(&ts) + 1; ret = tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* Duplicate values in the focal sets */ reference_set_0[0] = 1; ret = tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); reference_set_0[0] = 3; ret = tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE); /* Bad sample ID */ reference_sets[0] = focal; reference_set_size[0] = 1; reference_sets[1] = focal + 1; reference_set_size[1] = num_focal - 1; focal[0] = -1; ret = tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); focal[0] = (tsk_id_t) tsk_treeseq_get_num_nodes(&ts); ret = tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); focal[0] = (tsk_id_t) tsk_treeseq_get_num_nodes(&ts) + 100; ret = tsk_treeseq_genealogical_nearest_neighbours( &ts, focal, num_focal, reference_sets, 
reference_set_size, 2, 0, A); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); free(A); } static void test_single_tree_balance(void) { int ret; tsk_treeseq_t ts; tsk_tree_t t; tsk_size_t sackin, colless; double b1, b2; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); /* Balanced binary tree with 4 leaves */ CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); CU_ASSERT_EQUAL(sackin, 8); CU_ASSERT_EQUAL_FATAL(tsk_tree_colless_index(&t, &colless), 0); CU_ASSERT_EQUAL(colless, 0); CU_ASSERT_EQUAL_FATAL(tsk_tree_b1_index(&t, &b1), 0); CU_ASSERT_DOUBLE_EQUAL(b1, 2, 1e-8); /* Test different bases for b2_index to high-precision */ CU_ASSERT_EQUAL_FATAL(tsk_tree_b2_index(&t, 10, &b2), 0); CU_ASSERT_DOUBLE_EQUAL(b2, 0.6020599913279623, 1e-14); CU_ASSERT_EQUAL_FATAL(tsk_tree_b2_index(&t, 2, &b2), 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(b2, 2, 1e-16); CU_ASSERT_EQUAL_FATAL(tsk_tree_b2_index(&t, 3, &b2), 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(b2, 1.2618595071429148, 1e-14); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_multiroot_balance(void) { int ret; tsk_treeseq_t ts; tsk_tree_t t; tsk_size_t sackin; double b1; tsk_treeseq_from_text(&ts, 10, multiroot_ex_nodes, multiroot_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); /* 0.80┊ 10 */ /* ┊ ┏┻┓ */ /* 0.40┊ 9 ┃ ┃ */ /* ┊ ┏━┻┓ ┃ ┃ */ /* 0.30┊ ┃ ┃ ┃ ┃ */ /* ┊ ┃ ┃ ┃ ┃ */ /* 0.20┊ ┃ 7 ┃ ┃ */ /* ┊ ┃ ┏┻┓ ┃ ┃ */ /* 0.10┊ ┃ ┃ ┃ ┃ ┃ */ /* ┊ ┃ ┃ ┃ ┃ ┃ */ /* 0.00┊ 5 2 3 4 0 1 */ CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); CU_ASSERT_EQUAL(sackin, 7); CU_ASSERT_EQUAL_FATAL(tsk_tree_colless_index(&t, NULL), TSK_ERR_UNDEFINED_MULTIROOT); 
CU_ASSERT_EQUAL_FATAL(tsk_tree_b1_index(&t, &b1), 0); CU_ASSERT_DOUBLE_EQUAL(b1, 1.0, 1e-8); CU_ASSERT_EQUAL_FATAL(tsk_tree_b2_index(&t, 10, NULL), TSK_ERR_UNDEFINED_MULTIROOT); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_nonbinary_balance(void) { int ret; const char *nodes = "1 0 0\n" "1 0 0\n" "1 0 0\n" "1 0 0\n" "0 1 0"; const char *edges = "0 1 4 0,1,2,3\n"; tsk_treeseq_t ts; tsk_tree_t t; tsk_size_t sackin, colless; double b1, b2; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); /* Star tree with 4 leaves */ CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); CU_ASSERT_EQUAL(sackin, 4); CU_ASSERT_EQUAL_FATAL( tsk_tree_colless_index(&t, &colless), TSK_ERR_UNDEFINED_NONBINARY); CU_ASSERT_EQUAL_FATAL(tsk_tree_b1_index(&t, &b1), 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(b1, 0, 1e-8); CU_ASSERT_EQUAL_FATAL(tsk_tree_b2_index(&t, 10, &b2), 0); CU_ASSERT_DOUBLE_EQUAL_FATAL(b1, 0, 1e-8); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_empty_tree_balance(void) { int ret; tsk_table_collection_t tables; tsk_treeseq_t ts; tsk_tree_t t; tsk_size_t sackin, colless; double b1, b2; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); CU_ASSERT_EQUAL(sackin, 0); /* Technically wrong here because we have 0 roots, but not worth worrying about */ CU_ASSERT_EQUAL_FATAL( tsk_tree_colless_index(&t, &colless), TSK_ERR_UNDEFINED_MULTIROOT); CU_ASSERT_EQUAL_FATAL(tsk_tree_b1_index(&t, &b1), 0); CU_ASSERT_EQUAL(b1, 0); CU_ASSERT_EQUAL_FATAL(tsk_tree_b2_index(&t, 10, 
&b2), TSK_ERR_UNDEFINED_MULTIROOT); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_b2_bad_base(void) { int ret; tsk_treeseq_t ts; tsk_tree_t t; double result; double bad_base[] = { -2, -1, 1 }; size_t j; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); for (j = 0; j < sizeof(bad_base) / sizeof(*bad_base); j++) { ret = tsk_tree_b2_index(&t, bad_base[j], &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_isfinite(result)); } CU_ASSERT_FATAL(j > 0); /* this one is peculiar, in that base 0 seems to give a finite answer */ ret = tsk_tree_b2_index(&t, 0, &result); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(result, 0); tsk_treeseq_free(&ts); tsk_tree_free(&t); } static void test_tree_errors(void) { int ret; tsk_size_t j; tsk_id_t num_nodes = 9; tsk_id_t u; tsk_node_t node; tsk_treeseq_t ts, other_ts; tsk_tree_t t, other_t; tsk_id_t bad_nodes[] = { num_nodes + 1, num_nodes + 2, -1 }; tsk_id_t tracked_samples[] = { 0, 0, 0 }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, NULL, NULL, paper_ex_individuals, NULL, 0); ret = tsk_tree_init(&t, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); /* Out-of-bounds queries */ for (j = 0; j < sizeof(bad_nodes) / sizeof(tsk_id_t); j++) { u = bad_nodes[j]; ret = tsk_tree_get_parent(&t, u, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_tree_get_time(&t, u, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_tree_get_branch_length(&t, u, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_tree_get_mrca(&t, u, 0, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret 
= tsk_tree_get_mrca(&t, 0, u, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_tree_get_num_samples(&t, u, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_tree_get_num_tracked_samples(&t, u, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); /* Also check tree sequence methods */ ret = tsk_treeseq_get_node(&ts, (tsk_id_t) u, &node); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); CU_ASSERT(!tsk_treeseq_is_sample(&ts, u)); CU_ASSERT(!tsk_tree_is_sample(&t, u)); } tracked_samples[0] = 0; tracked_samples[1] = (tsk_id_t) tsk_treeseq_get_num_samples(&ts); ret = tsk_tree_set_tracked_samples(&t, 2, tracked_samples); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_SAMPLES); tracked_samples[1] = (tsk_id_t) tsk_treeseq_get_num_nodes(&ts); ret = tsk_tree_set_tracked_samples(&t, 2, tracked_samples); CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); tracked_samples[1] = 0; ret = tsk_tree_set_tracked_samples(&t, 2, tracked_samples); CU_ASSERT_EQUAL(ret, TSK_ERR_DUPLICATE_SAMPLE); tsk_treeseq_from_text(&other_ts, 10, paper_ex_nodes, paper_ex_edges, NULL, NULL, NULL, paper_ex_individuals, NULL, 0); ret = tsk_tree_init(&other_t, &other_ts, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_copy(&t, &other_t, TSK_NO_INIT); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); tsk_tree_free(&t); tsk_tree_free(&other_t); ret = tsk_tree_init(&t, &other_ts, TSK_NO_SAMPLE_COUNTS); CU_ASSERT_EQUAL(ret, 0); ret = tsk_tree_copy(&t, &other_t, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSUPPORTED_OPERATION); tsk_tree_free(&other_t); ret = tsk_tree_copy(&t, &other_t, TSK_SAMPLE_LISTS); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSUPPORTED_OPERATION); tsk_tree_free(&other_t); tsk_tree_free(&t); tsk_treeseq_free(&other_ts); tsk_treeseq_free(&ts); } static void test_treeseq_row_access_errors(void) { int ret; tsk_table_collection_t tables; tsk_treeseq_t ts; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_treeseq_init(&ts, &tables, 
TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_get_individual(&ts, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); ret = tsk_treeseq_get_node(&ts, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_treeseq_get_edge(&ts, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); ret = tsk_treeseq_get_migration(&ts, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); ret = tsk_treeseq_get_site(&ts, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); ret = tsk_treeseq_get_mutation(&ts, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); ret = tsk_treeseq_get_population(&ts, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_treeseq_get_provenance(&ts, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_treeseq_get_individuals_population_errors(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_treeseq_t ts; tsk_id_t output[2]; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, TSK_NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret_id = tsk_treeseq_get_individuals_population(&ts, output); CU_ASSERT_EQUAL_FATAL(ret_id, 
TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_treeseq_get_individuals_population(void) { int ret; tsk_id_t ret_id; int j; tsk_table_collection_t tables; tsk_treeseq_t ts; tsk_id_t output[4]; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; for (j = 0; j < 2; j++) { ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, (tsk_id_t) j); } for (j = 0; j < 4; j++) { ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, (tsk_id_t) j); } ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 3.0, 1, 3, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 2); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 3); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 4); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_get_individuals_population(&ts, output); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(output[0], TSK_NULL); CU_ASSERT_EQUAL_FATAL(output[1], 0); CU_ASSERT_EQUAL_FATAL(output[2], TSK_NULL); CU_ASSERT_EQUAL_FATAL(output[3], 1); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_treeseq_get_individuals_time_errors(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_treeseq_t ts; double output[2]; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = 
tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.2, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.8, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_get_individuals_time(&ts, output); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_TIME_MISMATCH); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_treeseq_get_individuals_time(void) { int ret; tsk_id_t ret_id; int j; tsk_table_collection_t tables; tsk_treeseq_t ts; double output[4]; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; for (j = 0; j < 2; j++) { ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, j); } for (j = 0; j < 4; j++) { ret_id = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, j); } ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 3.25, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 3.0, 1, 3, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 2); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 3.25, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 3); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1.25, 0, 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 4); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_get_individuals_time(&ts, output); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_EQUAL_FATAL(output[0], 3.25); CU_ASSERT_EQUAL_FATAL(output[1], 1.25); CU_ASSERT_FATAL(tsk_is_unknown_time(output[2])); CU_ASSERT_EQUAL_FATAL(output[3], 3.0); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_tree_copy_flags(void) { int iret, ret; tsk_size_t j; tsk_treeseq_t ts; tsk_tree_t t, other_t; tsk_flags_t options[] = { 0, TSK_NO_SAMPLE_COUNTS, TSK_SAMPLE_LISTS, TSK_NO_SAMPLE_COUNTS | TSK_SAMPLE_LISTS }; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, NULL, NULL, paper_ex_individuals, NULL, 0); for (j = 0; j < sizeof(options) / sizeof(*options); j++) { ret = tsk_tree_init(&t, &ts, options[j]); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_init(&other_t, &ts, options[j]); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_copy(&t, &other_t, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); check_trees_identical(&t, &other_t); tsk_tree_free(&other_t); while ((iret = tsk_tree_next(&t)) == TSK_TREE_OK) { ret = tsk_tree_copy(&t, &other_t, options[j]); CU_ASSERT_EQUAL_FATAL(ret, 0); check_trees_identical(&t, &other_t); tsk_tree_free(&other_t); } CU_ASSERT_EQUAL_FATAL(iret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); ret = tsk_tree_copy(&t, &other_t, options[j]); CU_ASSERT_EQUAL_FATAL(ret, 0); while (true) { CU_ASSERT_EQUAL_FATAL(ret, 0); check_trees_identical(&t, &other_t); CU_ASSERT_EQUAL_FATAL(tsk_tree_next(&t), tsk_tree_next(&other_t)); if (t.index == -1) { break; } } ret = tsk_tree_last(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); ret = tsk_tree_copy(&t, &other_t, TSK_NO_INIT | options[j]); CU_ASSERT_EQUAL_FATAL(ret, 0); while (true) { CU_ASSERT_EQUAL_FATAL(ret, 0); check_trees_identical(&t, &other_t); CU_ASSERT_EQUAL_FATAL(tsk_tree_prev(&t), tsk_tree_prev(&other_t)); if (t.index == -1) { break; } } tsk_tree_free(&other_t); tsk_tree_free(&t); } tsk_treeseq_free(&ts); } static void test_deduplicate_sites(void) { int ret; // Modified from paper_ex const char *tidy_sites = "1 0\n" "4.5 
0\n" "8.5 0\n"; const char *tidy_mutations = "0 2 1\n" "0 1 2\n" "0 6 3\n" "0 3 4\n" "1 0 1\n" "1 2 2\n" "1 4 3\n" "1 5 4\n" "2 5 1\n" "2 7 2\n" "2 1 3\n" "2 0 4\n"; const char *messy_sites = "1 0\n" "1 0\n" "1 0\n" "1 0\n" "4.5 0\n" "4.5 0\n" "4.5 0\n" "4.5 0\n" "8.5 0\n" "8.5 0\n" "8.5 0\n" "8.5 0\n"; const char *messy_mutations = "0 2 1\n" "1 1 2\n" "2 6 3\n" "3 3 4\n" "4 0 1\n" "5 2 2\n" "6 4 3\n" "7 5 4\n" "8 5 1\n" "9 7 2\n" "10 1 3\n" "11 0 4\n"; tsk_table_collection_t tidy, messy; ret = tsk_table_collection_init(&tidy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_init(&messy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); messy.sequence_length = 10; tidy.sequence_length = 10; parse_individuals(paper_ex_individuals, &tidy.individuals); parse_nodes(paper_ex_nodes, &tidy.nodes); parse_sites(tidy_sites, &tidy.sites); parse_mutations(tidy_mutations, &tidy.mutations); // test cleaning doesn't mess up the tidy one parse_individuals(paper_ex_individuals, &messy.individuals); parse_nodes(paper_ex_nodes, &messy.nodes); parse_sites(tidy_sites, &messy.sites); parse_mutations(tidy_mutations, &messy.mutations); ret = tsk_table_collection_deduplicate_sites(&messy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&tidy.sites, &messy.sites, 0)); CU_ASSERT_TRUE(tsk_mutation_table_equals(&tidy.mutations, &messy.mutations, 0)); tsk_site_table_clear(&messy.sites); tsk_mutation_table_clear(&messy.mutations); // test with the actual messy one parse_sites(messy_sites, &messy.sites); parse_mutations(messy_mutations, &messy.mutations); ret = tsk_table_collection_deduplicate_sites(&messy, 0); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_TRUE(tsk_site_table_equals(&tidy.sites, &messy.sites, 0)); CU_ASSERT_TRUE(tsk_mutation_table_equals(&tidy.mutations, &messy.mutations, 0)); tsk_table_collection_free(&tidy); tsk_table_collection_free(&messy); } static void test_deduplicate_sites_errors(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; ret = 
tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; ret_id = tsk_site_table_add_row(&tables.sites, 2, "A", 1, "m", 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_site_table_add_row(&tables.sites, 2, "TT", 2, "MM", 2); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_mutation_table_add_row(&tables.mutations, 0, 0, -1, 0, "T", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /* Negative position */ tables.sites.position[0] = -1; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_SITE_POSITION); tables.sites.position[0] = 2; /* unsorted position */ tables.sites.position[1] = 0.5; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_SITES); tables.sites.position[1] = 2; /* negative site ID */ tables.mutations.site[0] = -1; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); tables.mutations.site[0] = 0; /* site ID out of bounds */ tables.mutations.site[0] = 2; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); tables.mutations.site[0] = 0; /* Bad offset in metadata */ tables.sites.metadata_offset[0] = 2; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); tables.sites.metadata_offset[0] = 0; /* Bad length in metadata */ tables.sites.metadata_offset[2] = 100; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); tables.sites.metadata_offset[2] = 3; /* Bad offset in ancestral_state */ tables.sites.ancestral_state_offset[0] = 2; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); tables.sites.ancestral_state_offset[0] = 0; /* Bad length in ancestral_state */ 
tables.sites.ancestral_state_offset[2] = 100; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); tables.sites.ancestral_state_offset[2] = 3; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, 0); tsk_table_collection_free(&tables); } static void test_deduplicate_sites_zero_rows(void) { int ret; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(tables.sites.num_rows, 0) tsk_table_collection_free(&tables); } static void test_deduplicate_sites_multichar(void) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 10; ret_id = tsk_site_table_add_row(&tables.sites, 0, "AA", 1, "M", 1); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_site_table_add_row(&tables.sites, 0, "0", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_site_table_add_row(&tables.sites, 1, "BBBBB", 5, "NNNNN", 5); CU_ASSERT_EQUAL_FATAL(ret_id, 2); ret_id = tsk_site_table_add_row(&tables.sites, 1, "0", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 3); ret = tsk_table_collection_deduplicate_sites(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 2); CU_ASSERT_EQUAL_FATAL(tables.sites.position[0], 0); CU_ASSERT_EQUAL_FATAL(tables.sites.position[1], 1); CU_ASSERT_EQUAL_FATAL(tables.sites.ancestral_state[0], 'A'); CU_ASSERT_EQUAL_FATAL(tables.sites.ancestral_state_offset[1], 1); CU_ASSERT_EQUAL_FATAL(tables.sites.metadata[0], 'M'); CU_ASSERT_EQUAL_FATAL(tables.sites.metadata_offset[1], 1); CU_ASSERT_NSTRING_EQUAL(tables.sites.ancestral_state + 1, "BBBBB", 5); CU_ASSERT_EQUAL_FATAL(tables.sites.ancestral_state_offset[2], 6); CU_ASSERT_NSTRING_EQUAL(tables.sites.metadata + 1, "NNNNN", 5); 
CU_ASSERT_EQUAL_FATAL(tables.sites.metadata_offset[2], 6); tsk_table_collection_free(&tables); } static void test_empty_tree_sequence(void) { tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_tree_t t; tsk_id_t v; int ret; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH); tsk_treeseq_free(&ts); tables.sequence_length = 1.0; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); verify_empty_tree_sequence(&ts, 1.0); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_left_root(&t), TSK_NULL); CU_ASSERT_EQUAL_FATAL(t.interval.left, 0); CU_ASSERT_EQUAL_FATAL(t.interval.right, 1); CU_ASSERT_EQUAL_FATAL(t.num_edges, 0); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_parent(&t, 0, &v), 0); CU_ASSERT_EQUAL_FATAL(v, TSK_NULL); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_parent(&t, 1, &v), TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_tree_free(&t); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_last(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_left_root(&t), TSK_NULL); CU_ASSERT_EQUAL_FATAL(t.interval.left, 0); CU_ASSERT_EQUAL_FATAL(t.interval.right, 1); CU_ASSERT_EQUAL_FATAL(tsk_tree_get_parent(&t, 1, &v), TSK_ERR_NODE_OUT_OF_BOUNDS); tsk_tree_free(&t); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_zero_edges(void) { const char *nodes = "1 0 0\n" "1 0 0\n"; const char *edges = ""; const char *sites = "0.1 0\n" "0.2 0\n"; const char *mutations = "0 0 1\n" "1 1 1\n"; tsk_treeseq_t ts, tss; tsk_tree_t t; tsk_id_t samples, node_map; const tsk_id_t z = TSK_NULL; tsk_id_t parents[] = { z, z, }; int ret; tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); 
CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 2.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 1); tsk_treeseq_print_state(&ts, _devnull); verify_trees(&ts, 1, parents); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(t.interval.left, 0); CU_ASSERT_EQUAL(t.interval.right, 2); CU_ASSERT_EQUAL(t.num_edges, 0); CU_ASSERT_EQUAL(t.parent[0], TSK_NULL); CU_ASSERT_EQUAL(t.parent[1], TSK_NULL); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&t), 0); CU_ASSERT_EQUAL(t.left_sib[0], TSK_NULL); CU_ASSERT_EQUAL(t.right_sib[0], 1); tsk_tree_print_state(&t, _devnull); tsk_tree_free(&t); ret = tsk_tree_init(&t, &ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_last(&t); CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); CU_ASSERT_EQUAL(t.interval.left, 0); CU_ASSERT_EQUAL(t.interval.right, 2); CU_ASSERT_EQUAL(t.parent[0], TSK_NULL); CU_ASSERT_EQUAL(t.parent[1], TSK_NULL); CU_ASSERT_EQUAL(tsk_tree_get_left_root(&t), 0); CU_ASSERT_EQUAL(t.left_sib[0], TSK_NULL); CU_ASSERT_EQUAL(t.right_sib[0], 1); tsk_tree_print_state(&t, _devnull); tsk_tree_free(&t); /* We give pointers ot samples and node_map here as they must be non null */ ret = tsk_treeseq_simplify(&ts, &samples, 0, 0, &tss, &node_map); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&tss), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&tss), 2.0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&tss), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&tss), 2); CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&tss), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&tss), 1); tsk_treeseq_print_state(&ts, _devnull); tsk_treeseq_free(&ts); tsk_treeseq_free(&tss); } static void 
test_tree_sequence_metadata(void) { int ret; tsk_table_collection_t tc; tsk_treeseq_t ts; char example_metadata[100] = "An example of metadata with unicode 🎄🌳🌴🌲🎋"; char example_metadata_schema[100] = "An example of metadata schema with unicode 🎄🌳🌴🌲🎋"; char example_time_units[100] = "An example of time units ⏰"; tsk_size_t example_metadata_length = (tsk_size_t) strlen(example_metadata); tsk_size_t example_time_units_length = (tsk_size_t) strlen(example_metadata_schema); tsk_size_t example_metadata_schema_length = (tsk_size_t) strlen(example_time_units); ret = tsk_table_collection_init(&tc, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tc.sequence_length = 1.0; ret = tsk_table_collection_build_index(&tc, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_metadata( &tc, example_metadata, example_metadata_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_metadata_schema( &tc, example_metadata_schema, example_metadata_schema_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_set_time_units( &tc, example_time_units, example_time_units_length); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &tc, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_metadata_length(&ts), example_metadata_length); CU_ASSERT_EQUAL( tsk_treeseq_get_metadata_schema_length(&ts), example_metadata_schema_length); CU_ASSERT_EQUAL(tsk_memcmp(tsk_treeseq_get_metadata(&ts), example_metadata, example_metadata_length), 0); CU_ASSERT_EQUAL(tsk_memcmp(tsk_treeseq_get_metadata_schema(&ts), example_metadata_schema, example_metadata_schema_length), 0); CU_ASSERT_EQUAL(tsk_treeseq_get_time_units_length(&ts), example_time_units_length); CU_ASSERT_EQUAL(tsk_memcmp(tsk_treeseq_get_time_units(&ts), example_time_units, example_time_units_length), 0); tsk_treeseq_free(&ts); tsk_table_collection_free(&tc); } static int dummy_stat(tsk_size_t K, const double *X, tsk_size_t M, double *Y, void *params) { tsk_size_t k; CU_ASSERT_FATAL(M == K); 
CU_ASSERT_FATAL(params == NULL); for (k = 0; k < K; k++) { Y[k] = X[k]; } return 0; } static void test_time_uncalibrated(void) { int ret; tsk_table_collection_t tables; tsk_treeseq_t ts; tsk_treeseq_t ts2; tsk_size_t sample_set_sizes[] = { 2, 2 }; tsk_id_t samples[] = { 0, 1, 2, 3 }; tsk_size_t num_samples; double result[100]; double *W; double *sigma; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ts.time_uncalibrated, false); tsk_treeseq_free(&ts); ret = tsk_table_collection_set_time_units( &tables, TSK_TIME_UNITS_UNCALIBRATED, strlen(TSK_TIME_UNITS_UNCALIBRATED)); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ts.time_uncalibrated, true); tsk_treeseq_free(&ts); tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_table_collection_set_time_units( ts.tables, TSK_TIME_UNITS_UNCALIBRATED, strlen(TSK_TIME_UNITS_UNCALIBRATED)); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts2, ts.tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_allele_frequency_spectrum( &ts2, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_SITE, result); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_allele_frequency_spectrum( &ts2, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_UNCALIBRATED); ret = tsk_treeseq_allele_frequency_spectrum(&ts2, 2, sample_set_sizes, samples, 0, NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_ALLOW_TIME_UNCALIBRATED, result); CU_ASSERT_EQUAL_FATAL(ret, 0); sigma = tsk_calloc(tsk_treeseq_get_num_nodes(&ts2), sizeof(double)); num_samples = tsk_treeseq_get_num_samples(&ts2); W = tsk_calloc(num_samples, 
sizeof(double)); ret = tsk_treeseq_general_stat(&ts2, 1, W, 1, dummy_stat, NULL, tsk_treeseq_get_num_trees(&ts2), tsk_treeseq_get_breakpoints(&ts2), TSK_STAT_SITE, sigma); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_general_stat(&ts2, 1, W, 1, dummy_stat, NULL, tsk_treeseq_get_num_trees(&ts2), tsk_treeseq_get_breakpoints(&ts2), TSK_STAT_BRANCH, sigma); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_UNCALIBRATED); ret = tsk_treeseq_general_stat(&ts2, 1, W, 1, dummy_stat, NULL, tsk_treeseq_get_num_trees(&ts2), tsk_treeseq_get_breakpoints(&ts2), TSK_STAT_BRANCH | TSK_STAT_ALLOW_TIME_UNCALIBRATED, sigma); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_divergence_matrix( &ts2, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_UNCALIBRATED); ret = tsk_treeseq_divergence_matrix(&ts2, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_ALLOW_TIME_UNCALIBRATED, result); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_safe_free(W); tsk_safe_free(sigma); tsk_treeseq_free(&ts); tsk_treeseq_free(&ts2); tsk_table_collection_free(&tables); } static void test_reference_sequence(void) { int ret; tsk_table_collection_t tables; tsk_treeseq_t ts; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FALSE(tsk_treeseq_has_reference_sequence(&ts)); tsk_treeseq_free(&ts); ret = tsk_reference_sequence_set_data(&tables.reference_sequence, "abc", 3); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_treeseq_has_reference_sequence(&ts)); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } static void test_split_edges_no_populations(void) { int ret; tsk_treeseq_t ts, split_ts; tsk_table_collection_t tables; tsk_id_t new_nodes[] = { 9, 10, 11 }; tsk_size_t num_new_nodes = 3; const char *metadata = "some 
metadata"; tsk_size_t j; tsk_node_t node; double time = 0.09; tsk_id_t ret_id; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret_id = tsk_table_collection_copy(ts.tables, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); tsk_treeseq_free(&ts); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_treeseq_init(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /* NOTE: haven't worked out the exact IDs on the branches here, just * for illustration. 0.25┊ 8 ┊ ┊ ┊ ┊ ┏━┻━┓ ┊ ┊ ┊ 0.20┊ ┃ ┃ ┊ ┊ 7 ┊ ┊ ┃ ┃ ┊ ┊ ┏━┻━┓ ┊ 0.17┊ 6 ┃ ┊ 6 ┊ ┃ ┃ ┊ ┊ ┏━┻┓ ┃ ┊ ┏━┻━┓ ┊ ┃ ┃ ┊ 0.09┊ 9 5 10┊ 9 5 ┊ 11 5 ┊ ┊ ┃ ┏┻┓ ┃ ┊ ┃ ┏━┻┓ ┊ ┃ ┏━┻┓ ┊ 0.07┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┊ ┃ ┃ 4 ┊ ┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┏┻┓ ┊ 0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊ 0.00 2.00 7.00 10.00 */ ret = tsk_treeseq_split_edges( &ts, time, 1234, 0, metadata, strlen(metadata), 0, &split_ts); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&split_ts), 3); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&split_ts), 12); for (j = 0; j < num_new_nodes; j++) { ret = tsk_treeseq_get_node(&split_ts, new_nodes[j], &node); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(node.time, time); CU_ASSERT_EQUAL(node.flags, 1234); CU_ASSERT_EQUAL(node.individual, TSK_NULL); CU_ASSERT_EQUAL(node.population, 0); CU_ASSERT_EQUAL(node.metadata_length, strlen(metadata)); CU_ASSERT_EQUAL(strncmp(node.metadata, metadata, strlen(metadata)), 0); } tsk_treeseq_free(&split_ts); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_split_edges_populations(void) { int ret; tsk_treeseq_t ts, split_ts; tsk_table_collection_t tables; double time = 0.5; tsk_node_t node; tsk_id_t valid_pops[] = { -1, 0, 1 }; tsk_id_t num_valid_pops = 3; tsk_id_t j, population, ret_id; ret = 
tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, 0, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1, 1, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_edge_table_add_row(&tables.edges, 0, 1, 1, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = 0; j < num_valid_pops; j++) { population = valid_pops[j]; ret = tsk_treeseq_split_edges(&ts, time, 0, population, NULL, 0, 0, &split_ts); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&split_ts), 1); CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&split_ts), 3); ret = tsk_treeseq_get_node(&split_ts, 2, &node); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(node.population, population); tsk_treeseq_free(&split_ts); } tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_split_edges_errors(void) { int ret; tsk_treeseq_t ts, split_ts; tsk_table_collection_t tables; double time = 0.5; tsk_id_t invalid_pops[] = { -2, 2, 3 }; tsk_id_t num_invalid_pops = 3; tsk_id_t j, population, ret_id; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0, 0, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret_id = tsk_node_table_add_row(&tables.nodes, 0, 1, 1, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 1); 
ret_id = tsk_edge_table_add_row(&tables.edges, 0, 1, 1, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_split_edges( &ts, TSK_UNKNOWN_TIME, 0, TSK_NULL, NULL, 0, 0, &split_ts); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); for (j = 0; j < num_invalid_pops; j++) { population = invalid_pops[j]; ret = tsk_treeseq_split_edges(&ts, time, 0, population, NULL, 0, 0, &split_ts); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); tsk_treeseq_free(&split_ts); } tsk_treeseq_free(&ts); ret_id = tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 1, 1.0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, 0); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_split_edges(&ts, time, 0, population, NULL, 0, 0, &split_ts); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED); tsk_treeseq_free(&split_ts); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } static void test_extend_haplotypes_simple(void) { int ret; tsk_treeseq_t ts, ets; const char *nodes = "1 0 -1 -1\n" "1 0 -1 -1\n" "0 2.0 -1 -1\n"; const char *edges = "0 10 2 0\n" "0 10 2 1\n"; const char *sites = "0.0 0\n" "1.0 0\n"; const char *mutations = "0 0 1 -1 0.5\n" "1 1 1 -1 0.5\n"; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_extend_haplotypes(&ts, 10, 0, &ets); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE_FATAL(tsk_table_collection_equals(ts.tables, ets.tables, 0)); tsk_treeseq_free(&ts); tsk_treeseq_free(&ets); } static void test_extend_haplotypes_errors(void) { int ret; tsk_treeseq_t ts, ets; const char *nodes = "1 0 -1 -1\n" "1 0 -1 -1\n" "0 2.0 -1 -1\n"; const char *edges = "0 10 2 0\n" "0 10 2 1\n"; const char *sites = "0.0 0\n" "1.0 0\n"; const char *mutations = "0 0 1 -1 0.5\n" "1 1 1 -1 0.5\n"; const char *mutations_no_time = "0 0 1 -1\n" "1 1 1 
-1\n"; // left, right, node source, dest, time const char *migrations = "0 10 0 0 1 0.5\n" "0 10 0 1 0 1.5\n"; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_extend_haplotypes(&ts, -2, 0, &ets); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EXTEND_EDGES_BAD_MAXITER); tsk_treeseq_free(&ts); tsk_treeseq_from_text( &ts, 10, nodes, edges, migrations, sites, mutations, NULL, NULL, 0); ret = tsk_treeseq_extend_haplotypes(&ts, 10, 0, &ets); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED); tsk_treeseq_free(&ts); tsk_treeseq_from_text( &ts, 10, nodes, edges, NULL, sites, mutations_no_time, NULL, NULL, 0); ret = tsk_treeseq_extend_haplotypes(&ts, 10, 0, &ets); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME); tsk_treeseq_free(&ts); tsk_treeseq_free(&ets); } static void assert_equal_except_edges_and_mutation_nodes( const tsk_treeseq_t *ts1, const tsk_treeseq_t *ts2) { tsk_table_collection_t t1, t2; int ret; ret = tsk_table_collection_copy(ts1->tables, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_copy(ts2->tables, &t2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_memset(t1.mutations.node, 0, t1.mutations.num_rows * sizeof(*t1.mutations.node)); tsk_memset(t2.mutations.node, 0, t2.mutations.num_rows * sizeof(*t2.mutations.node)); tsk_edge_table_clear(&t1.edges); tsk_edge_table_clear(&t2.edges); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); tsk_table_collection_free(&t1); tsk_table_collection_free(&t2); } static void test_extend_haplotypes(void) { int ret = 0; int max_iter = 10; tsk_treeseq_t ts, ets; FILE *tmp = fopen(_tmp_file_name, "w"); /* 7 and 8 should be extended to the whole sequence; * also 5 to the second tree (where x's are) 6 6 6 6 +-+-+ +-+-+ +-+-+ +-+-+ | | 7 x x 8 x x | | ++-+ | | +-++ | | 4 5 4 | x 4 | 5 4 5 +++ +++ +++ | | | | +++ +++ +++ 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 */ const char *nodes = "1 0 -1 -1\n" "1 0 -1 -1\n" "1 0 -1 -1\n" "1 0 -1 -1\n" "0 1.0 -1 
-1\n" "0 1.0 -1 -1\n" "0 3.0 -1 -1\n" "0 2.0 -1 -1\n" "0 2.0 -1 -1\n"; // l, r, p, c const char *edges = "0 10 4 0\n" "0 5 4 1\n" "7 10 4 1\n" "0 2 5 2\n" "5 10 5 2\n" "0 2 5 3\n" "5 10 5 3\n" "2 5 7 2\n" "2 5 7 4\n" "5 7 8 1\n" "5 7 8 5\n" "2 5 6 3\n" "0 2 6 4\n" "5 10 6 4\n" "0 2 6 5\n" "7 10 6 5\n" "2 5 6 7\n" "5 7 6 8\n"; const char *sites = "0.0 0\n" "9.0 0\n"; const char *mutations = "0 4 1 -1 2.5\n" "0 4 2 0 1.5\n" "1 6 3 -1 3.5\n" "1 5 1 2 2.5\n" "1 5 2 3 1.5\n"; tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); for (max_iter = 1; max_iter < 10; max_iter++) { ret = tsk_treeseq_extend_haplotypes(&ts, max_iter, 0, &ets); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_equal_except_edges_and_mutation_nodes(&ts, &ets); CU_ASSERT_TRUE(ets.tables->edges.num_rows >= 12); tsk_treeseq_free(&ets); } ret = tsk_treeseq_extend_haplotypes(&ts, max_iter, 0, &ets); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(ets.tables->nodes.num_rows, 9); CU_ASSERT_EQUAL_FATAL(ets.tables->edges.num_rows, 12); assert_equal_except_edges_and_mutation_nodes(&ts, &ets); tsk_treeseq_free(&ets); tsk_set_debug_stream(tmp); ret = tsk_treeseq_extend_haplotypes(&ts, max_iter, TSK_DEBUG, &ets); tsk_set_debug_stream(stdout); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(ftell(tmp) > 0); tsk_treeseq_free(&ets); fclose(tmp); tsk_treeseq_free(&ts); } static void test_extend_haplotypes_conflicting_times(void) { int ret; int max_iter = 10; tsk_treeseq_t ts, ets; /* 3.00┊ 3 ┊ 4 ┊ ┊ ┃ ┊ ┃ ┊ 2.00┊ ┃ ┊ 2 ┊ ┊ ┃ ┊ ┃ ┊ 1.00┊ 1 ┊ ┃ ┊ ┊ ┃ ┊ ┃ ┊ 0.00┊ 0 ┊ 0 ┊ 0 2 4 */ const char *nodes = "1 0.0 -1 -1\n" "0 1.0 -1 -1\n" "0 2.0 -1 -1\n" "0 3.0 -1 -1\n" "0 3.0 -1 -1\n"; // l, r, p, c const char *edges = "0.0 2.0 1 0\n" "2.0 4.0 2 0\n" "0.0 2.0 3 1\n" "2.0 4.0 4 2\n"; tsk_treeseq_from_text(&ts, 4, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ts.tables->edges.num_rows, 4); ret = tsk_treeseq_extend_haplotypes(&ts, max_iter, 0, &ets); CU_ASSERT_EQUAL_FATAL(ret, 0); 
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, ets.tables, 0)); tsk_treeseq_free(&ets); tsk_treeseq_free(&ts); } static void test_extend_haplotypes_new_edge(void) { int ret; int max_iter = 10; tsk_treeseq_t ts, ets, ref_ts; /* This is an example where new edges are added * on both forwards and back passes 4.00┊ ┊ 4 ┊ 4 ┊ 4 ┊ ┊ ┊ ┃ ┊ ┃ ┊ ┃ ┊ 3.00┊ 2 ┊ ┃ ┊ 2 ┊ 2 ┊ ┊ ┃ ┊ ┃ ┊ ┃ ┊ ┃ ┊ 2.00┊ ┃ ┊ 3 ┊ ┃ ┊ 3 ┊ ┊ ┃ ┊ ┃ ┊ ┃ ┊ ┃ ┊ 1.00┊ 1 ┊ ┃ ┊ ┃ ┊ ┃ ┊ ┊ ┃ ┊ ┃ ┊ ┃ ┊ ┃ ┊ 0.00┊ 0 ┊ 0 ┊ 0 ┊ 0 ┊ 0 2 4 6 8 */ const char *nodes = "1 0.0 -1 -1\n" "0 1.0 -1 -1\n" "0 3.0 -1 -1\n" "0 2.0 -1 -1\n" "0 4.0 -1 -1\n"; // l, r, p, c const char *edges = "0.0 2.0 1 0\n" "2.0 4.0 3 0\n" "6.0 8.0 3 0\n" "4.0 5.0 2 0\n" "5.0 6.0 2 0\n" "0.0 2.0 2 1\n" "6.0 7.0 2 3\n" "7.0 8.0 2 3\n" "4.0 8.0 4 2\n" "2.0 4.0 4 3\n"; const char *ext_edges = "0.0 8.0 1 0\n" "0.0 8.0 3 1\n" "0.0 8.0 2 3\n" "2.0 8.0 4 2\n"; const char *sites = "3.0 0\n"; // s, n , ds, t const char *mutations = "0 4 5 -1 4.5\n" "0 3 4 0 3.5\n" "0 3 3 1 2.5\n" "0 0 2 2 1.5\n" "0 0 1 3 0.5\n"; const char *ext_mutations = "0 4 5 -1 4.5\n" "0 2 4 0 3.5\n" "0 3 3 1 2.5\n" "0 1 2 2 1.5\n" "0 0 1 3 0.5\n"; tsk_treeseq_from_text(&ts, 8, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ts.tables->edges.num_rows, 10); tsk_treeseq_from_text( &ref_ts, 8, nodes, ext_edges, NULL, sites, ext_mutations, NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ref_ts.tables->edges.num_rows, 4); ret = tsk_treeseq_extend_haplotypes(&ts, max_iter, 0, &ets); CU_ASSERT_EQUAL_FATAL(ret, 0); assert_equal_except_edges_and_mutation_nodes(&ts, &ets); CU_ASSERT_TRUE(tsk_table_collection_equals(ets.tables, ref_ts.tables, 0)); tsk_treeseq_free(&ets); tsk_treeseq_free(&ts); tsk_treeseq_free(&ref_ts); } static void test_init_take_ownership_no_edge_metadata(void) { int ret; tsk_treeseq_t ts; tsk_table_collection_t *tables = tsk_malloc(sizeof(tsk_table_collection_t)); CU_ASSERT_NOT_EQUAL_FATAL(tables, NULL); tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, 
paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, tables, TSK_TC_NO_EDGE_METADATA); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); ret = tsk_treeseq_init(&ts, tables, TSK_TAKE_OWNERSHIP); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA); tsk_treeseq_free(&ts); } static void test_init_compute_mutation_parents(void) { int ret; tsk_table_collection_t *tables, *tables2; tsk_treeseq_t ts; const char *sites = "0 0\n"; /* Make a mutation on a parallel branch the parent*/ const char *bad_mutations = "0 0 1 -1\n" "0 1 1 0\n"; tables = tsk_malloc(sizeof(tsk_table_collection_t)); CU_ASSERT_NOT_EQUAL_FATAL(tables, NULL); tables2 = tsk_malloc(sizeof(tsk_table_collection_t)); CU_ASSERT_NOT_EQUAL_FATAL(tables2, NULL); CU_ASSERT_FATAL(tables != NULL); ret = tsk_table_collection_init(tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables->sequence_length = 1; parse_nodes(single_tree_ex_nodes, &tables->nodes); CU_ASSERT_EQUAL_FATAL(tables->nodes.num_rows, 7); parse_edges(single_tree_ex_edges, &tables->edges); CU_ASSERT_EQUAL_FATAL(tables->edges.num_rows, 6); parse_sites(sites, &tables->sites); CU_ASSERT_EQUAL_FATAL(tables->sites.num_rows, 1); parse_mutations(bad_mutations, &tables->mutations); CU_ASSERT_EQUAL_FATAL(tables->mutations.num_rows, 2); tables->sequence_length = 1.0; ret = tsk_table_collection_copy(tables, tables2, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(&ts, tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_MUTATION_PARENT); tsk_treeseq_free(&ts); ret = tsk_treeseq_init( &ts, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_treeseq_free(&ts); /* When we use take ownership, the check of parents shouldn't overwrite them*/ ret = tsk_treeseq_init(&ts, tables, TSK_TAKE_OWNERSHIP | TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_MUTATION_PARENT); 
CU_ASSERT_EQUAL(tables->mutations.parent[0], -1); CU_ASSERT_EQUAL(tables->mutations.parent[1], 0); tsk_treeseq_free(&ts); /* When we use take ownership and compute, the tables are overwritten*/ ret = tsk_treeseq_init(&ts, tables2, TSK_TAKE_OWNERSHIP | TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(tables2->mutations.parent[0], -1); CU_ASSERT_EQUAL(tables2->mutations.parent[1], -1); /* Don't need to free tables as we took ownership */ tsk_treeseq_free(&ts); } static void test_init_compute_mutation_parents_errors(void) { int ret; tsk_id_t row_ret; tsk_table_collection_t tables; tsk_treeseq_t ts; const char *sites = "0.5 0\n" "0 0\n"; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; parse_nodes(single_tree_ex_nodes, &tables.nodes); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7); parse_edges(single_tree_ex_edges, &tables.edges); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6); parse_sites(sites, &tables.sites); CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 2); tables.sequence_length = 1.0; ret = tsk_treeseq_init( &ts, &tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_SITES); tsk_treeseq_free(&ts); tsk_site_table_clear(&tables.sites); row_ret = tsk_site_table_add_row(&tables.sites, 0.5, "A", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(row_ret, 0); row_ret = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, "A", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(row_ret, 0); row_ret = tsk_mutation_table_add_row( &tables.mutations, 0, 4, TSK_NULL, TSK_UNKNOWN_TIME, "A", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(row_ret, 1); ret = tsk_treeseq_init( &ts, &tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TS_INIT_COMPUTE_MUTATION_PARENTS); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } int main(int argc, char **argv) { 
/* Registration table for this test file: each entry pairs a test's display name with its function, the list ends with a { NULL, NULL } entry, and the whole array is passed to test_main() together with argc/argv. */
CU_TestInfo tests[] = { /* simplest example tests */ { "test_simplest_discrete_genome", test_simplest_discrete_genome }, { "test_simplest_discrete_time", test_simplest_discrete_time }, { "test_simplest_min_time", test_simplest_min_time }, { "test_simplest_max_time", test_simplest_max_time }, { "test_simplest_records", test_simplest_records }, { "test_simplest_nonbinary_records", test_simplest_nonbinary_records }, { "test_simplest_unary_records", test_simplest_unary_records }, { "test_simplest_unary_with_individuals", test_simplest_unary_with_individuals }, { "test_simplest_non_sample_leaf_records", test_simplest_non_sample_leaf_records }, { "test_simplest_degenerate_multiple_root_records", test_simplest_degenerate_multiple_root_records }, { "test_simplest_multiple_root_records", test_simplest_multiple_root_records }, { "test_simplest_zero_root_tree", test_simplest_zero_root_tree }, { "test_simplest_multi_root_tree", test_simplest_multi_root_tree }, { "test_simplest_tree_mrca", test_simplest_tree_mrca }, { "test_simplest_root_mutations", test_simplest_root_mutations }, { "test_simplest_back_mutations", test_simplest_back_mutations }, { "test_simplest_general_samples", test_simplest_general_samples }, { "test_simplest_holey_tree_sequence", test_simplest_holey_tree_sequence }, { "test_simplest_holey_tsk_treeseq_zero_roots", test_simplest_holey_tsk_treeseq_zero_roots }, { "test_simplest_holey_tsk_treeseq_mutation_parents", test_simplest_holey_tsk_treeseq_mutation_parents }, { "test_simplest_initial_gap_tree_sequence", test_simplest_initial_gap_tree_sequence }, { "test_simplest_initial_gap_zero_roots", test_simplest_initial_gap_zero_roots }, { "test_simplest_initial_gap_tsk_treeseq_mutation_parents", test_simplest_initial_gap_tsk_treeseq_mutation_parents }, { "test_simplest_final_gap_tree_sequence", test_simplest_final_gap_tree_sequence }, { "test_simplest_final_gap_tsk_treeseq_mutation_parents", test_simplest_final_gap_tsk_treeseq_mutation_parents }, { 
"test_simplest_individuals", test_simplest_individuals }, { "test_simplest_bad_individuals", test_simplest_bad_individuals }, { "test_simplest_bad_edges", test_simplest_bad_edges }, { "test_simplest_bad_indexes", test_simplest_bad_indexes }, { "test_simplest_bad_migrations", test_simplest_bad_migrations }, { "test_simplest_migration_simplify", test_simplest_migration_simplify }, { "test_simplest_overlapping_parents", test_simplest_overlapping_parents }, { "test_simplest_contradictory_children", test_simplest_contradictory_children }, { "test_simplest_overlapping_edges_simplify", test_simplest_overlapping_edges_simplify }, { "test_simplest_overlapping_unary_edges_simplify", test_simplest_overlapping_unary_edges_simplify }, { "test_simplest_overlapping_unary_edges_internal_samples_simplify", test_simplest_overlapping_unary_edges_internal_samples_simplify }, { "test_simplest_reduce_site_topology", test_simplest_reduce_site_topology }, { "test_simplest_simplify_defragment", test_simplest_simplify_defragment }, { "test_simplest_population_filter", test_simplest_population_filter }, { "test_simplest_individual_filter", test_simplest_individual_filter }, { "test_simplest_no_node_filter", test_simplest_no_node_filter }, { "test_simplest_no_update_flags", test_simplest_no_update_flags }, { "test_simplest_map_mutations", test_simplest_map_mutations }, { "test_simplest_nonbinary_map_mutations", test_simplest_nonbinary_map_mutations }, { "test_simplest_unary_map_mutations", test_simplest_unary_map_mutations }, { "test_simplest_non_sample_leaf_map_mutations", test_simplest_non_sample_leaf_map_mutations }, { "test_simplest_internal_sample_map_mutations", test_simplest_internal_sample_map_mutations }, { "test_simplest_multiple_root_map_mutations", test_simplest_multiple_root_map_mutations }, { "test_simplest_chained_map_mutations", test_simplest_chained_map_mutations }, { "test_simplest_mutation_edges", test_simplest_mutation_edges }, /* Single tree tests */ { 
"test_single_tree_good_records", test_single_tree_good_records }, { "test_single_nonbinary_tree_good_records", test_single_nonbinary_tree_good_records }, { "test_single_tree_bad_records", test_single_tree_bad_records }, { "test_single_tree_good_mutations", test_single_tree_good_mutations }, { "test_single_tree_bad_mutations", test_single_tree_bad_mutations }, { "test_single_tree_iter", test_single_tree_iter }, { "test_single_tree_general_samples_iter", test_single_tree_general_samples_iter }, { "test_single_nonbinary_tree_iter", test_single_nonbinary_tree_iter }, { "test_single_tree_iter_times", test_single_tree_iter_times }, { "test_single_tree_iter_depths", test_single_tree_iter_depths }, { "test_single_tree_simplify", test_single_tree_simplify }, { "test_single_tree_simplify_debug", test_single_tree_simplify_debug }, { "test_single_tree_simplify_keep_input_roots", test_single_tree_simplify_keep_input_roots }, { "test_single_tree_simplify_no_sample_nodes", test_single_tree_simplify_no_sample_nodes }, { "test_single_tree_simplify_null_samples", test_single_tree_simplify_null_samples }, { "test_single_tree_compute_mutation_parents", test_single_tree_compute_mutation_parents }, { "test_single_tree_compute_mutation_times", test_single_tree_compute_mutation_times }, { "test_single_tree_mutation_edges", test_single_tree_mutation_edges }, { "test_single_tree_is_descendant", test_single_tree_is_descendant }, { "test_single_tree_total_branch_length", test_single_tree_total_branch_length }, { "test_single_tree_num_lineages", test_single_tree_num_lineages }, { "test_single_tree_map_mutations", test_single_tree_map_mutations }, { "test_single_tree_map_mutations_internal_samples", test_single_tree_map_mutations_internal_samples }, { "test_single_tree_tracked_samples", test_single_tree_tracked_samples }, { "test_single_tree_tree_pos", test_single_tree_tree_pos }, /* Multi tree tests */ { "test_simple_multi_tree", test_simple_multi_tree }, { 
"test_multi_tree_direction_switching_tree_pos", test_multi_tree_direction_switching_tree_pos }, { "test_nonbinary_multi_tree", test_nonbinary_multi_tree }, { "test_unary_multi_tree", test_unary_multi_tree }, { "test_internal_sample_multi_tree", test_internal_sample_multi_tree }, { "test_internal_sample_simplified_multi_tree", test_internal_sample_simplified_multi_tree }, { "test_simplify_keep_input_roots_multi_tree", test_simplify_keep_input_roots_multi_tree }, { "test_left_to_right_multi_tree", test_left_to_right_multi_tree }, { "test_gappy_multi_tree", test_gappy_multi_tree }, { "test_convenience_arrays_multi_tree", test_convenience_arrays_multi_tree }, { "test_tsk_treeseq_bad_records", test_tsk_treeseq_bad_records }, /* multiroot tests */ { "test_multiroot_mrca", test_multiroot_mrca }, /* Sample sets */ { "test_simple_sample_sets", test_simple_sample_sets }, { "test_nonbinary_sample_sets", test_nonbinary_sample_sets }, { "test_internal_sample_sample_sets", test_internal_sample_sample_sets }, { "test_non_sample_leaf_sample_lists", test_non_sample_leaf_sample_lists }, { "test_no_sample_count_semantics", test_no_sample_count_semantics }, { "test_virtual_root_properties", test_virtual_root_properties }, /* tree traversal orders */ { "test_single_tree_traversal", test_single_tree_traversal }, { "test_multiroot_tree_traversal", test_multiroot_tree_traversal }, /* Seek */ { "test_seek_multi_tree", test_seek_multi_tree }, { "test_seek_errors", test_seek_errors }, /* KC distance tests */ { "test_single_tree_kc", test_single_tree_kc }, { "test_isolated_node_kc", test_isolated_node_kc }, { "test_two_trees_kc", test_two_trees_kc }, { "test_empty_tree_kc", test_empty_tree_kc }, { "test_nonbinary_tree_kc", test_nonbinary_tree_kc }, { "test_nonzero_samples_kc", test_nonzero_samples_kc }, { "test_internal_samples_kc", test_internal_samples_kc }, { "test_non_sample_leaf_kc", test_non_sample_leaf_kc }, { "test_unequal_sample_size_kc", test_unequal_sample_size_kc }, { 
"test_unequal_samples_kc", test_unequal_samples_kc }, { "test_unary_nodes_kc", test_unary_nodes_kc }, { "test_no_sample_lists_kc", test_no_sample_lists_kc }, { "test_unequal_sequence_lengths_kc", test_unequal_sequence_lengths_kc }, { "test_different_number_trees_kc", test_different_number_trees_kc }, { "test_offset_trees_with_errors_kc", test_offset_trees_with_errors_kc }, /* Tree balance/imbalance index tests */ { "test_single_tree_balance", test_single_tree_balance }, { "test_multiroot_balance", test_multiroot_balance }, { "test_nonbinary_balance", test_nonbinary_balance }, { "test_empty_tree_balance", test_empty_tree_balance }, { "test_b2_bad_base", test_b2_bad_base }, /* Misc */ { "test_tree_errors", test_tree_errors }, { "test_treeseq_row_access_errors", test_treeseq_row_access_errors }, { "test_treeseq_get_individuals_population_errors", test_treeseq_get_individuals_population_errors }, { "test_treeseq_get_individuals_population", test_treeseq_get_individuals_population }, { "test_treeseq_get_individuals_time_errors", test_treeseq_get_individuals_time_errors }, { "test_treeseq_get_individuals_time", test_treeseq_get_individuals_time }, { "test_tree_copy_flags", test_tree_copy_flags }, { "test_genealogical_nearest_neighbours_errors", test_genealogical_nearest_neighbours_errors }, { "test_deduplicate_sites", test_deduplicate_sites }, { "test_deduplicate_sites_errors", test_deduplicate_sites_errors }, { "test_deduplicate_sites_zero_rows", test_deduplicate_sites_zero_rows }, { "test_deduplicate_sites_multichar", test_deduplicate_sites_multichar }, { "test_empty_tree_sequence", test_empty_tree_sequence }, { "test_zero_edges", test_zero_edges }, { "test_tree_sequence_metadata", test_tree_sequence_metadata }, { "test_time_uncalibrated", test_time_uncalibrated }, { "test_reference_sequence", test_reference_sequence }, { "test_split_edges_no_populations", test_split_edges_no_populations }, { "test_split_edges_populations", test_split_edges_populations }, { 
"test_split_edges_errors", test_split_edges_errors }, { "test_extend_haplotypes_simple", test_extend_haplotypes_simple }, { "test_extend_haplotypes_errors", test_extend_haplotypes_errors }, { "test_extend_haplotypes", test_extend_haplotypes }, { "test_extend_haplotypes_new_edge", test_extend_haplotypes_new_edge }, { "test_extend_haplotypes_conflicting_times", test_extend_haplotypes_conflicting_times }, { "test_init_take_ownership_no_edge_metadata", test_init_take_ownership_no_edge_metadata }, { "test_init_compute_mutation_parents", test_init_compute_mutation_parents }, { "test_init_compute_mutation_parents_errors", test_init_compute_mutation_parents_errors }, { NULL, NULL }, }; return test_main(tests, argc, argv); } ================================================ FILE: c/tests/testlib.c ================================================ /* * MIT License * * Copyright (c) 2019-2024 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "testlib.h" char *_tmp_file_name; FILE *_devnull; /* Simple single tree example. */ const char *single_tree_ex_nodes = /* 6 */ "1 0 -1 -1\n" /* / \ */ "1 0 -1 -1\n" /* / \ */ "1 0 -1 -1\n" /* / \ */ "1 0 -1 -1\n" /* / 5 */ "0 1 -1 -1\n" /* 4 / \ */ "0 2 -1 -1\n" /* / \ / \ */ "0 3 -1 -1\n"; /* 0 1 2 3 */ const char *single_tree_ex_edges = "0 1 4 0,1\n" "0 1 5 2,3\n" "0 1 6 4,5\n"; const char *single_tree_ex_sites = "0.125 0\n" "0.25 0\n" "0.5 0\n"; /* site, node, derived_state, [parent, time] */ const char *single_tree_ex_mutations = "0 2 1 -1\n" "1 4 1 -1\n" "1 0 0 1\n" /* Back mutation over 0 */ "2 0 1 -1\n" /* recurrent mutations over samples */ "2 1 1 -1\n" "2 2 1 -1\n" "2 3 1 -1\n"; /*** Example from the PLOS paper ***/ /* 0.25┊ 8 ┊ ┊ ┊ ┊ ┏━┻━┓ ┊ ┊ ┊ 0.20┊ ┃ ┃ ┊ ┊ 7 ┊ ┊ ┃ ┃ ┊ ┊ ┏━┻━┓ ┊ 0.17┊ 6 ┃ ┊ 6 ┊ ┃ ┃ ┊ ┊ ┏━┻┓ ┃ ┊ ┏━┻━┓ ┊ ┃ ┃ ┊ 0.09┊ ┃ 5 ┃ ┊ ┃ 5 ┊ ┃ 5 ┊ ┊ ┃ ┏┻┓ ┃ ┊ ┃ ┏━┻┓ ┊ ┃ ┏━┻┓ ┊ 0.07┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┊ ┃ ┃ 4 ┊ ┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┏┻┓ ┊ 0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊ 0.00 2.00 7.00 10.00 */ const char *paper_ex_nodes = "1 0 -1 0\n" "1 0 -1 0\n" "1 0 -1 1\n" "1 0 -1 1\n" "0 0.071 -1 -1\n" "0 0.090 -1 -1\n" "0 0.170 -1 -1\n" "0 0.202 -1 -1\n" "0 0.253 -1 -1\n"; const char *paper_ex_edges = "2 10 4 2\n" "2 10 4 3\n" "0 10 5 1\n" "0 2 5 3\n" "2 10 5 4\n" "0 7 6 0,5\n" "7 10 7 0,5\n" "0 2 8 2,6\n"; /* We make one mutation for each tree */ const char *paper_ex_sites = "1 0\n" "4.5 0\n" "8.5 0\n"; const char *paper_ex_mutations = "0 2 1\n" "1 0 1\n" "2 5 1\n"; /* Two (diploid) individuals */ const char *paper_ex_individuals = "0 0.2,1.5 -1,-1\n" "0 0.0,0.0 -1,-1\n"; /*** An example of a nonbinary tree sequence ***/ /* 0.41┊ 12 ┊ 12 ┊ ┊ ┏━━┻━━┓ ┊ ┏━┻━━┓ ┊ 0.28┊ ┃ ┃ ┊ 11 ┃ ┊ ┊ ┃ ┃ ┊ ┏━┻━┓ ┃ ┊ 0.13┊ ┃ 10 ┊ ┃ ┃ 10 ┊ ┊ ┃ ┏━╋━┓ ┊ ┃ ┃ ┏┻┓ ┊ 0.07┊ 9 ┃ ┃ ┃ ┊ 9 ┃ ┃ ┃ ┊ ┊ ┏━━┻━┓ ┃ ┃ ┃ ┊ ┏━━┻━┓ ┃ ┃ ┃ ┊ 0.01┊ 8 ┃ ┃ ┃ ┃ ┊ 8 ┃ ┃ ┃ ┃ ┊ ┊ ┏━┳┻┳━┓ ┃ ┃ ┃ ┃ ┊ ┏━┳┻┳━┓ ┃ ┃ ┃ ┃ ┊ 0.00┊ 0 1 2 3 6 4 5 7 ┊ 0 1 2 3 6 5 4 7 ┊ 0 17 100 */ const char 
*nonbinary_ex_nodes = "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "0 0.01 0 -1\n" "0 0.068 0 -1\n" "0 0.130 0 -1\n" "0 0.279 0 -1\n" "0 0.405 0 -1\n"; const char *nonbinary_ex_edges = "0 100 8 0,1,2,3\n" "0 100 9 6,8\n" "0 100 10 4\n" "0 17 10 5\n" "0 100 10 7\n" "17 100 11 5,9\n" "0 17 12 9\n" "0 100 12 10\n" "17 100 12 11"; const char *nonbinary_ex_sites = "1 0\n" "18 0\n"; const char *nonbinary_ex_mutations = "0 2 1\n" "1 11 1"; /*** An example of a tree sequence with unary nodes * and also a non-sample leaf (node 9). ***/ /* 0.25┊ 8 ┊ 8 ┊ ┊ ┊ ┊ ┏━━┻━━┓ ┊ ┃ ┊ ┊ ┊ 0.20┊ ┃ 7 ┊ ┃ ┊ 7 ┊ ┊ ┊ ┃ ┃ ┊ ┃ ┊ ┏━┻━━┓ ┊ ┊ 0.17┊ 6 ┃ ┊ 6 ┊ ┃ ┃ ┊ ┊ ┊ ┏━┻━┓ ┃ ┊ ┏━┻━━┓ ┊ ┃ ┃ ┊ ┊ 0.09┊ ┃ 5 ┃ ┊ ┃ 5 ┊ ┃ 5 ┊ ┊ ┊ ┃ ┏━╋━┓ ┃ ┊ ┃ ┏━━╋━━┓ ┊ ┃ ┏━━╋━━┓ ┊ ┊ 0.07┊ ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┃ ┊ ┃ ┃ 4 ┃ ┊ ┊ ┊ ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┃ ┊ ┃ ┃ ┏┻┓ ┃ ┊ ┊ 0.00┊ 0 1 3 9 2 ┊ 0 1 2 3 9 ┊ 0 1 2 3 9 ┊ 0 1 2 3 ┊ 0 2 7 10 100 */ const char *unary_ex_nodes = "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" "0 0.071 0 -1\n" "0 0.090 0 -1\n" "0 0.170 0 -1\n" "0 0.202 0 -1\n" "0 0.253 0 -1\n" "0 0 0 -1\n"; const char *unary_ex_edges = "2 10 4 2,3\n" "0 10 5 1\n" "0 2 5 3\n" "2 10 5 4\n" "0 10 5 9\n" "0 7 6 0,5\n" "7 10 7 0\n" "0 2 7 2\n" "7 10 7 5\n" "0 7 8 6\n" "0 2 8 7\n"; /* We make one mutation for each tree, over unary nodes if they exist */ const char *unary_ex_sites = "1.0 0\n" "4.5 0\n" "8.5 0\n"; const char *unary_ex_mutations = "0 2 1\n" "1 6 1\n" "1 9 0\n" "2 5 1\n"; /* An example of a simple tree sequence with multiple marginal trees. */ /* Simple single tree example. 
*/ const char *multiple_tree_ex_nodes = /* */ "1 0 -1 -1\n" /* 6 | */ "1 0 -1 -1\n" /* / \ | */ "1 0 -1 -1\n" /* / \ | 5 */ "0 1 -1 -1\n" /* 4 \ | / \ */ "0 2 -1 -1\n" /* / \ \ | / 3 */ "0 3 -1 -1\n" /* / \ \ | / / \ */ "0 4 -1 -1\n"; /* 0 1 2 | 0 1 2 */ /* |----------------|---------------| */ /* 0 1 2 */ const char *multiple_tree_ex_edges = "0.75 1.0 3 1,2\n" "0.0 0.75 4 0,1\n" "0.75 1.0 5 0,3\n" "0.0 0.75 6 2,4\n"; /* Odd topology -- different roots. */ const char *odd_tree1_ex_nodes = /* | | 5 */ "1 0 -1 -1\n" /* | 4 | | */ "1 0 -1 -1\n" /* 3 | | | | */ "0 1 -1 -1\n" /* | | | | | */ "0 2 -1 -1\n" /* 2 | 2 | 2 */ "0 3 -1 -1\n" /* / \ | / \ | / \ */ "0 4 -1 -1\n"; /* 0 1 | 0 1 | 0 1 */ /* |------|-------|------| */ /* 0.0 0.2 0.7 1.0*/ const char *odd_tree1_ex_edges = "0.0 1.0 2 0,1\n" "0.0 0.2 3 2\n" "0.2 0.7 4 2\n" "0.7 1.0 4 2\n"; /* An example where some samples descend from other samples, and multiple roots */ const char *multi_root_tree_ex_nodes = "1 0 -1 -1\n" /* 4 5 */ "1 0 -1 -1\n" /* | | */ "1 1 -1 -1\n" /* 2 3 */ "1 1 -1 -1\n" /* | | */ "0 2 -1 -1\n" /* 0 1 */ "0 2 -1 -1\n"; const char *multi_root_tree_ex_edges = "0 1 2 0\n" "0 1 3 1\n" "0 1 4 2\n" "0 1 5 3\n"; /* Examples of tree sequences where samples have different paths to the same ancestor. 
*/ const char *multi_path_tree_ex_nodes = /* 5 | */ "1 0 -1 -1\n" /* / \ | */ "1 0 -1 -1\n" /* / 4 | 4 */ "1 0 -1 -1\n" /* / / \ | / \ */ "0 1 -1 -1\n" /* / / \ | 3 \ */ "0 2 -1 -1\n" /* / / \ | / \ \ */ "0 3 -1 -1\n"; /* 0 2 1 | 0 2 1 */ /*----------------|------------ */ /*0.0 0.2 1.0*/ const char *multi_path_tree_ex_edges = "0.2 1.0 3 0\n" "0.2 1.0 3 2\n" "0.0 1.0 4 1\n" "0.0 0.2 4 2\n" "0.2 1.0 4 3\n" "0.0 0.2 5 0\n" "0.0 0.2 5 4\n"; const char *multi_path_tree_ex2_nodes = "1 0 -1 -1\n" "1 0 -1 -1\n" "0 1 -1 -1\n" "0 2 -1 -1\n" "0 3 -1 -1\n"; const char *multi_path_tree_ex2_edges = "0.6 1.0 2 1\n" "0.0 1.0 3 0\n" "0.0 0.6 4 1\n" "0.6 1.0 4 2\n" "0.0 1.0 4 3\n"; /* An example of a tree sequence with internally sampled nodes. */ /* 1.20┊ ┊ 8 ┊ ┊ ┊ ┊ ┏━┻━┓ ┊ ┊ 1.00┊ 7 ┊ ┃ ┃ ┊ ┊ ┊ ┏━┻━┓ ┊ ┃ ┃ ┊ ┊ 0.70┊ ┃ ┃ ┊ ┃ ┃ ┊ 6 ┊ ┊ ┃ ┃ ┊ ┃ ┃ ┊ ┏━┻━┓ ┊ 0.50┊ ┃ 5 ┊ 5 ┃ ┊ ┃ 5 ┊ ┊ ┃ ┏━┻┓ ┊ ┏┻━┓ ┃ ┊ ┃ ┏━┻┓ ┊ 0.40┊ ┃ ┃ 4 ┊ 4 ┃ ┃ ┊ ┃ ┃ 4 ┊ ┊ ┃ ┃ ┏┻┓ ┊ ┏┻┓ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ 0.20┊ ┃ ┃ ┃ 3 ┊ ┃ ┃ ┃ 3 ┊ ┃ ┃ ┃ 3 ┊ ┊ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┊ 0.10┊ ┃ 1 2 ┊ ┃ 2 1 ┊ ┃ 1 2 ┊ ┊ ┃ ┊ ┃ ┊ ┃ ┊ 0.00┊ 0 ┊ 0 ┊ 0 ┊ 0.00 2.00 8.00 10.00 */ const char *internal_sample_ex_nodes = "1 0.0 0 -1\n" "1 0.1 0 -1\n" "1 0.1 0 -1\n" "1 0.2 0 -1\n" "0 0.4 0 -1\n" "1 0.5 0 -1\n" "0 0.7 0 -1\n" "0 1.0 0 -1\n" "0 1.2 0 -1\n"; const char *internal_sample_ex_edges = "2 8 4 0\n" "0 10 4 2\n" "0 2 4 3\n" "8 10 4 3\n" "0 10 5 1,4\n" "8 10 6 0,5\n" "0 2 7 0,5\n" "2 8 8 3,5\n"; /* We make one mutation for each tree, some above the internal node */ const char *internal_sample_ex_sites = "1.0 0\n" "4.5 0\n" "8.5 0\n"; const char *internal_sample_ex_mutations = "0 2 1\n" "1 5 1\n" "2 5 1\n"; /*** An example of a tree sequence with multiple roots. 
***/ /* 0.90┊ ┊ 11 ┊ ┊ ┊ ┊ ┏┻┓ ┊ ┊ 0.80┊ 10 ┊ ┃ ┃ ┊ ┊ ┊ ┏┻┓ ┊ ┃ ┃ ┊ ┊ 0.40┊ 9 ┃ ┃ ┊ 9 ┃ ┃ ┊ 9 ┊ ┊ ┏━┻┓ ┃ ┃ ┊ ┏━┻━┓ ┃ ┃ ┊ ┏━┻━━┓ ┊ 0.30┊ ┃ ┃ ┃ ┃ ┊ ┃ 8 ┃ ┃ ┊ ┃ 8 ┊ ┊ ┃ ┃ ┃ ┃ ┊ ┃ ┏┻┓ ┃ ┃ ┊ ┃ ┏┻┓ ┊ 0.20┊ ┃ 7 ┃ ┃ ┊ 7 ┃ ┃ ┃ ┃ ┊ 7 ┃ ┃ ┊ ┊ ┃ ┏┻┓ ┃ ┃ ┊ ┏┻┓ ┃ ┃ ┃ ┃ ┊ ┏━┻┓ ┃ ┃ ┊ 0.10┊ ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┃ ┃ ┃ ┊ ┃ 6 ┃ ┃ ┊ ┊ ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┏┻┓ ┃ ┃ ┊ 0.00┊ 5 2 3 4 0 1 ┊ 3 4 1 2 0 5 ┊ 4 0 3 1 2 5 ┊ 0 4 8 10 */ const char *multiroot_ex_nodes = "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n" "0 0.1 0 -1\n" "0 0.2 0 -1\n" "0 0.3 0 -1\n" "0 0.4 0 -1\n" "0 0.8 0 -1\n" "0 0.9 0 -1\n"; const char *multiroot_ex_edges = "8 10 6 0,3\n" "0 8 7 3\n" "0 10 7 4\n" "8 10 7 6\n" "4 10 8 1,2\n" "0 4 9 2\n" "0 10 9 7\n" "4 10 9 8\n" "0 4 10 0,1\n" "4 8 11 0,5\n"; /* We make one mutation over each root node */ const char *multiroot_ex_sites = "1.0 0\n" "2.0 0\n" "3.0 0\n" "5.0 0\n" "6.0 0\n" "8.0 0\n" "9.0 0\n"; const char *multiroot_ex_mutations = "0 10 1\n" "1 9 1\n" "2 5 1\n" "3 11 1\n" "4 9 1\n" "5 9 1\n" "6 5 1\n"; /*** An example of a empty tree sequence. ***/ const char *empty_ex_nodes = "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n"; const char *empty_ex_edges = ""; /*** An example of a tree sequence with missing marginal trees. ***/ /* | 4 | | 4 | | / \ | | / \ | | 3 \ | | / 3 | | / \ \ | | / / \ | | 0 1 2 | | 0 1 2 | |-|-----------|-|-----------|-| 0 1 2 3 4 5 */ const char *missing_ex_nodes = "1 0.0 0 -1\n" "1 0.0 0 -1\n" "1 0.0 0 -1\n" "0 1.0 0 -1\n" "0 2.0 0 -1\n"; const char *missing_ex_edges = "1.0 2.0 3 0\n" "1.0 2.0 3 1\n" "3.0 4.0 3 1\n" "3.0 4.0 3 2\n" "3.0 4.0 4 0\n" "1.0 2.0 4 2\n" "1.0 2.0 4 3\n" "3.0 4.0 4 3\n"; /* Simple utilities to parse text so we can write declaritive * tests. This is not intended as a robust general input mechanism. 
*/ void parse_nodes(const char *text, tsk_node_table_t *node_table) { tsk_id_t ret_id; size_t c, k; size_t MAX_LINE = 1024; char line[MAX_LINE]; const char *whitespace = " \t"; char *p; double time; int flags, population, individual; char *name; c = 0; while (text[c] != '\0') { /* Fill in the line */ k = 0; while (text[c] != '\n' && text[c] != '\0') { CU_ASSERT_FATAL(k < MAX_LINE - 1); line[k] = text[c]; c++; k++; } if (text[c] == '\n') { c++; } line[k] = '\0'; p = strtok(line, whitespace); CU_ASSERT_FATAL(p != NULL); flags = atoi(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); time = atof(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); population = atoi(p); p = strtok(NULL, whitespace); if (p == NULL) { individual = -1; } else { individual = atoi(p); p = strtok(NULL, whitespace); } if (p == NULL) { name = ""; } else { name = p; } ret_id = tsk_node_table_add_row( node_table, flags, time, population, individual, name, strlen(name)); CU_ASSERT_FATAL(ret_id >= 0); } } void parse_edges(const char *text, tsk_edge_table_t *edge_table) { tsk_id_t ret_id; size_t c, k; size_t MAX_LINE = 1024; char line[MAX_LINE], sub_line[MAX_LINE]; const char *whitespace = " \t"; char *p, *q; double left, right; tsk_id_t parent, child; uint32_t num_children; c = 0; while (text[c] != '\0') { /* Fill in the line */ k = 0; while (text[c] != '\n' && text[c] != '\0') { CU_ASSERT_FATAL(k < MAX_LINE - 1); line[k] = text[c]; c++; k++; } if (text[c] == '\n') { c++; } line[k] = '\0'; p = strtok(line, whitespace); CU_ASSERT_FATAL(p != NULL); left = atof(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); right = atof(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); parent = atoi(p); num_children = 0; p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); num_children = 1; q = p; while (*q != '\0') { if (*q == ',') { num_children++; } q++; } CU_ASSERT_FATAL(num_children >= 1); strncpy(sub_line, p, MAX_LINE); q = strtok(sub_line, ","); for (k = 0; k 
< num_children; k++) { CU_ASSERT_FATAL(q != NULL); child = atoi(q); ret_id = tsk_edge_table_add_row( edge_table, left, right, parent, child, NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); q = strtok(NULL, ","); } CU_ASSERT_FATAL(q == NULL); } } void parse_migrations(const char *text, tsk_migration_table_t *migration_table) { tsk_id_t ret_id; size_t c, k; size_t MAX_LINE = 1024; char line[MAX_LINE]; const char *whitespace = " \t"; char *p; double left, right, time; int node, source, dest; char *metadata; c = 0; while (text[c] != '\0') { /* Fill in the line */ k = 0; while (text[c] != '\n' && text[c] != '\0') { CU_ASSERT_FATAL(k < MAX_LINE - 1); line[k] = text[c]; c++; k++; } if (text[c] == '\n') { c++; } line[k] = '\0'; p = strtok(line, whitespace); CU_ASSERT_FATAL(p != NULL); left = atof(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); right = atof(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); node = atoi(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); source = atoi(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); dest = atoi(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); time = atof(p); p = strtok(NULL, whitespace); if (p == NULL) { metadata = ""; } else { metadata = p; } ret_id = tsk_migration_table_add_row(migration_table, left, right, node, source, dest, time, metadata, strlen(metadata)); CU_ASSERT_FATAL(ret_id >= 0); } } void parse_sites(const char *text, tsk_site_table_t *site_table) { tsk_id_t ret_id; size_t c, k; size_t MAX_LINE = 1024; char line[MAX_LINE]; double position; char ancestral_state[MAX_LINE]; const char *whitespace = " \t"; char *p; c = 0; while (text[c] != '\0') { /* Fill in the line */ k = 0; while (text[c] != '\n' && text[c] != '\0') { CU_ASSERT_FATAL(k < MAX_LINE - 1); line[k] = text[c]; c++; k++; } if (text[c] == '\n') { c++; } line[k] = '\0'; p = strtok(line, whitespace); CU_ASSERT_FATAL(p != NULL); position = atof(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); 
strncpy(ancestral_state, p, MAX_LINE); ret_id = tsk_site_table_add_row( site_table, position, ancestral_state, strlen(ancestral_state), NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); } } void parse_mutations(const char *text, tsk_mutation_table_t *mutation_table) { tsk_id_t ret_id; size_t c, k; size_t MAX_LINE = 1024; char line[MAX_LINE]; const char *whitespace = " \t"; char *p; tsk_id_t node, site, parent; double time; char derived_state[MAX_LINE]; /* site, node, derived_state, [parent, time] */ c = 0; while (text[c] != '\0') { /* Fill in the line */ k = 0; while (text[c] != '\n' && text[c] != '\0') { CU_ASSERT_FATAL(k < MAX_LINE - 1); line[k] = text[c]; c++; k++; } if (text[c] == '\n') { c++; } line[k] = '\0'; p = strtok(line, whitespace); site = atoi(p); CU_ASSERT_FATAL(p != NULL); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); node = atoi(p); p = strtok(NULL, whitespace); CU_ASSERT_FATAL(p != NULL); strncpy(derived_state, p, MAX_LINE); parent = TSK_NULL; p = strtok(NULL, whitespace); if (p != NULL) { parent = atoi(p); } time = TSK_UNKNOWN_TIME; p = strtok(NULL, whitespace); if (p != NULL) { time = atof(p); } ret_id = tsk_mutation_table_add_row(mutation_table, site, node, parent, time, derived_state, strlen(derived_state), NULL, 0); CU_ASSERT_FATAL(ret_id >= 0); } } void parse_individuals(const char *text, tsk_individual_table_t *individual_table) { tsk_id_t ret_id; size_t c, k; size_t MAX_LINE = 1024; char line[MAX_LINE]; char sub_line[MAX_LINE]; const char *whitespace = " \t"; char *p, *q; char *p_cont, *q_cont; // re-entrant pointers for strtok_r double location[MAX_LINE]; int location_len; tsk_id_t parents[MAX_LINE]; int parents_len; int flags; char *name; c = 0; while (text[c] != '\0') { /* Fill in the line */ k = 0; while (text[c] != '\n' && text[c] != '\0') { CU_ASSERT_FATAL(k < MAX_LINE - 1); line[k] = text[c]; c++; k++; } if (text[c] == '\n') { c++; } line[k] = '\0'; p = strtok_r(line, whitespace, &p_cont); CU_ASSERT_FATAL(p != NULL); flags = 
atoi(p); p = strtok_r(NULL, whitespace, &p_cont); CU_ASSERT_FATAL(p != NULL); // the locations are comma-separated location_len = 1; q = p; while (*q != '\0') { if (*q == ',') { location_len++; } q++; } CU_ASSERT_FATAL(location_len >= 1); strncpy(sub_line, p, MAX_LINE); q = strtok_r(sub_line, ",", &q_cont); for (k = 0; k < location_len; k++) { CU_ASSERT_FATAL(q != NULL); location[k] = atof(q); q = strtok_r(NULL, ",", &q_cont); } CU_ASSERT_FATAL(q == NULL); /* parents and name are optional */ p = strtok_r(NULL, whitespace, &p_cont); parents_len = 0; name = ""; if (p != NULL) { // the parents are comma-separated parents_len = 1; q = p; while (*q != '\0') { if (*q == ',') { parents_len++; } q++; } CU_ASSERT_FATAL(parents_len >= 1); strncpy(sub_line, p, MAX_LINE); q = strtok_r(sub_line, ",", &q_cont); for (k = 0; k < parents_len; k++) { CU_ASSERT_FATAL(q != NULL); parents[k] = atoi(q); q = strtok_r(NULL, ",", &q_cont); } CU_ASSERT_FATAL(q == NULL); p = strtok_r(NULL, whitespace, &p_cont); if (p != NULL) { name = p; } } ret_id = tsk_individual_table_add_row(individual_table, flags, location, location_len, parents, parents_len, name, strlen(name)); CU_ASSERT_FATAL(ret_id >= 0); } } void tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nodes, const char *edges, const char *migrations, const char *sites, const char *mutations, const char *individuals, const char *provenance, tsk_flags_t tc_options) { int ret; tsk_id_t ret_id; tsk_table_collection_t tables; tsk_id_t max_population_id; tsk_size_t j; tsk_flags_t ts_flags; bool all_parents_null; CU_ASSERT_FATAL(ts != NULL); CU_ASSERT_FATAL(nodes != NULL); CU_ASSERT_FATAL(edges != NULL); /* Not supporting provenance here for now */ CU_ASSERT_FATAL(provenance == NULL); ret = tsk_table_collection_init(&tables, tc_options); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = sequence_length; parse_nodes(nodes, &tables.nodes); parse_edges(edges, &tables.edges); if (sites != NULL) { parse_sites(sites, 
&tables.sites); } if (mutations != NULL) { parse_mutations(mutations, &tables.mutations); } if (individuals != NULL) { parse_individuals(individuals, &tables.individuals); } if (migrations != NULL) { parse_migrations(migrations, &tables.migrations); } /* We need to add in populations if they are referenced */ max_population_id = -1; for (j = 0; j < tables.nodes.num_rows; j++) { max_population_id = TSK_MAX(max_population_id, tables.nodes.population[j]); } for (j = 0; j < tables.migrations.num_rows; j++) { max_population_id = TSK_MAX(max_population_id, tables.migrations.source[j]); max_population_id = TSK_MAX(max_population_id, tables.migrations.dest[j]); } if (max_population_id >= 0) { for (j = 0; j <= (tsk_size_t) max_population_id; j++) { ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret_id, j); } } /* If all mutation.parent are TSK_NULL, use TSK_TS_COMPUTE_MUTATION_PARENTS flag too */ ts_flags = TSK_TS_INIT_BUILD_INDEXES; all_parents_null = true; for (j = 0; j < tables.mutations.num_rows; j++) { if (tables.mutations.parent[j] != TSK_NULL) { all_parents_null = false; break; } } if (all_parents_null) { ts_flags |= TSK_TS_INIT_COMPUTE_MUTATION_PARENTS; } ret = tsk_treeseq_init(ts, &tables, ts_flags); /* tsk_treeseq_print_state(ts, stdout); */ if (ret != 0) { printf("\nret = %s\n", tsk_strerror(ret)); } CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); } /* Returns a tree sequence consisting of a single tree with n samples. This * is a full example of the data model, with values included for all fields. 
*/ tsk_treeseq_t * caterpillar_tree(tsk_size_t n, tsk_size_t num_sites, tsk_size_t num_mutations) { int ret; tsk_id_t ret_id; tsk_treeseq_t *ts = tsk_malloc(sizeof(tsk_treeseq_t)); tsk_table_collection_t tables; tsk_id_t j, k, last_node, u; int state, m; double position[2]; tsk_id_t parents[2] = { -1, -1 }; const char *states[] = { "0", "1" }; const char *metadata[] = { "This", "is", "some", "metadata" }; const int num_metadatas = sizeof(metadata) / sizeof(*metadata); const char *metadata_schema = "mock metadata schema"; const char *ts_metadata = "This is a caterpillar tree"; const char *ts_metadata_schema = "The metadata is an example"; const char *prov_timestamp = "a timestamp, should be ISO8601"; const char *prov_record = "Produced by caterpillar_tree for testing purposes"; CU_ASSERT_FATAL(ts != NULL); ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(num_sites > 0 && num_mutations < n - 1); tables.sequence_length = 1.0; tsk_table_collection_set_metadata(&tables, ts_metadata, strlen(ts_metadata)); tsk_table_collection_set_metadata_schema( &tables, ts_metadata_schema, strlen(ts_metadata_schema)); tsk_reference_sequence_set_metadata_schema( &tables.reference_sequence, ts_metadata_schema, strlen(ts_metadata_schema)); tsk_reference_sequence_set_metadata( &tables.reference_sequence, ts_metadata, strlen(ts_metadata)); tsk_reference_sequence_set_data(&tables.reference_sequence, "A", 1); tsk_reference_sequence_set_url(&tables.reference_sequence, "B", 1); tsk_population_table_set_metadata_schema( &tables.populations, metadata_schema, strlen(metadata_schema)); tsk_individual_table_set_metadata_schema( &tables.individuals, metadata_schema, strlen(metadata_schema)); tsk_node_table_set_metadata_schema( &tables.nodes, metadata_schema, strlen(metadata_schema)); tsk_edge_table_set_metadata_schema( &tables.edges, metadata_schema, strlen(metadata_schema)); tsk_site_table_set_metadata_schema( &tables.sites, metadata_schema, 
strlen(metadata_schema)); tsk_mutation_table_set_metadata_schema( &tables.mutations, metadata_schema, strlen(metadata_schema)); tsk_migration_table_set_metadata_schema( &tables.migrations, metadata_schema, strlen(metadata_schema)); for (j = 0; j < (tsk_id_t) n; j++) { position[0] = j; position[1] = j; m = j % num_metadatas; ret_id = tsk_population_table_add_row( &tables.populations, metadata[m], strlen(metadata[m])); CU_ASSERT_EQUAL_FATAL(ret_id, j); ret_id = tsk_individual_table_add_row(&tables.individuals, 0, position, 2, parents, 2, metadata[m], strlen(metadata[m])); CU_ASSERT_EQUAL_FATAL(ret_id, j); ret_id = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0, j, j, metadata[m], strlen(metadata[m])); CU_ASSERT_EQUAL_FATAL(ret_id, j); } last_node = 0; for (j = 0; j < n - 1; j++) { m = j % num_metadatas; ret_id = tsk_node_table_add_row( &tables.nodes, 0, j + 1, j % n, TSK_NULL, metadata[m], strlen(metadata[m])); CU_ASSERT_FATAL(ret_id >= 0); u = ret_id; ret_id = tsk_edge_table_add_row( &tables.edges, 0, 1, u, last_node, metadata[m], strlen(metadata[m])); CU_ASSERT_FATAL(ret_id >= 0); ret_id = tsk_edge_table_add_row( &tables.edges, 0, 1, u, j + 1, metadata[m], strlen(metadata[m])); CU_ASSERT_FATAL(ret_id >= 0); last_node = u; } for (j = 0; j < num_sites; j++) { m = j % num_metadatas; ret_id = tsk_site_table_add_row(&tables.sites, (j + 1) / (double) n, states[0], strlen(states[0]), metadata[m], strlen(metadata[m])); CU_ASSERT_FATAL(ret_id >= 0); u = 2 * n - 3; state = 0; for (k = 0; k < num_mutations; k++) { m = k % num_metadatas; state = (state + 1) % 2; ret_id = tsk_mutation_table_add_row(&tables.mutations, j, u, TSK_NULL, tables.nodes.time[u], states[state], strlen(states[state]), metadata[m], strlen(metadata[m])); CU_ASSERT_FATAL(ret_id >= 0); u--; } } ret_id = tsk_provenance_table_add_row(&tables.provenances, prov_timestamp, strlen(prov_timestamp), prov_record, strlen(prov_record)); CU_ASSERT_EQUAL_FATAL(ret_id, 0); /* TODO make these consistent with 
the caterpillar tree topology. */ for (j = 0; j < n - 1; j++) { m = j % num_metadatas; ret_id = tsk_migration_table_add_row(&tables.migrations, 0, 1, j, j, j + 1, j + 1.5, metadata[m], strlen(metadata[m])); CU_ASSERT_FATAL(ret_id >= 0); } ret = tsk_table_collection_sort(&tables, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_build_index(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_compute_mutation_parents(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_treeseq_init(ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); return ts; } void unsort_edges(tsk_edge_table_t *edges, size_t start) { size_t j, k; size_t n = edges->num_rows - start; tsk_edge_t *buff = tsk_malloc(n * sizeof(tsk_edge_t)); CU_ASSERT_FATAL(buff != NULL); for (j = 0; j < n; j++) { k = start + j; buff[j].left = edges->left[k]; buff[j].right = edges->right[k]; buff[j].parent = edges->parent[k]; buff[j].child = edges->child[k]; } for (j = 0; j < n; j++) { k = start + j; edges->left[k] = buff[n - j - 1].left; edges->right[k] = buff[n - j - 1].right; edges->parent[k] = buff[n - j - 1].parent; edges->child[k] = buff[n - j - 1].child; } free(buff); } static int tskit_suite_init(void) { int fd = -1; static char template[] = "/tmp/tsk_c_test_XXXXXX"; _tmp_file_name = NULL; _devnull = NULL; _tmp_file_name = tsk_malloc(sizeof(template)); if (_tmp_file_name == NULL) { return CUE_NOMEMORY; } strcpy(_tmp_file_name, template); fd = mkstemp(_tmp_file_name); if (fd == -1) { return CUE_SINIT_FAILED; } close(fd); _devnull = fopen("/dev/null", "w"); if (_devnull == NULL) { return CUE_SINIT_FAILED; } return CUE_SUCCESS; } static int tskit_suite_cleanup(void) { if (_tmp_file_name != NULL) { unlink(_tmp_file_name); free(_tmp_file_name); } if (_devnull != NULL) { fclose(_devnull); } return CUE_SUCCESS; } static void handle_cunit_error(void) { fprintf(stderr, "CUnit error occured: %d: %s\n", CU_get_error(), CU_get_error_msg()); 
exit(EXIT_FAILURE); } int test_main(CU_TestInfo *tests, int argc, char **argv) { int ret; CU_pTest test; CU_pSuite suite; CU_SuiteInfo suites[] = { { .pName = "tskit", .pInitFunc = tskit_suite_init, .pCleanupFunc = tskit_suite_cleanup, .pTests = tests, }, CU_SUITE_INFO_NULL, }; if (CUE_SUCCESS != CU_initialize_registry()) { handle_cunit_error(); } if (CUE_SUCCESS != CU_register_suites(suites)) { handle_cunit_error(); } CU_basic_set_mode(CU_BRM_VERBOSE); if (argc == 1) { CU_basic_run_tests(); } else if (argc == 2) { suite = CU_get_suite_by_name("tskit", CU_get_registry()); if (suite == NULL) { printf("Suite not found\n"); return EXIT_FAILURE; } test = CU_get_test_by_name(argv[1], suite); if (test == NULL) { printf("Test '%s' not found\n", argv[1]); return EXIT_FAILURE; } CU_basic_run_test(suite, test); } else { printf("usage: %s \n", argv[0]); return EXIT_FAILURE; } ret = EXIT_SUCCESS; if (CU_get_number_of_tests_failed() != 0) { printf("Test failed!\n"); ret = EXIT_FAILURE; } CU_cleanup_registry(); return ret; } ================================================ FILE: c/tests/testlib.h ================================================ /* * MIT License * * Copyright (c) 2019-2024 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef __TESTLIB_H__ #define __TESTLIB_H__ #define _GNU_SOURCE #include #include #include #include #include /* Global variables used in the test suite */ extern char *_tmp_file_name; extern FILE *_devnull; int test_main(CU_TestInfo *tests, int argc, char **argv); void tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nodes, const char *edges, const char *migrations, const char *sites, const char *mutations, const char *individuals, const char *provenance, tsk_flags_t tc_options); tsk_treeseq_t *caterpillar_tree( tsk_size_t num_samples, tsk_size_t num_sites, tsk_size_t num_mutations); void parse_nodes(const char *text, tsk_node_table_t *node_table); void parse_edges(const char *text, tsk_edge_table_t *edge_table); void parse_sites(const char *text, tsk_site_table_t *site_table); void parse_mutations(const char *text, tsk_mutation_table_t *mutation_table); void parse_individuals(const char *text, tsk_individual_table_t *individual_table); void unsort_edges(tsk_edge_table_t *edges, size_t start); /* Use a macro so we can get line numbers at roughly the right place */ #define assert_arrays_almost_equal(len, a, b) \ { \ do { \ tsk_size_t _j; \ for (_j = 0; _j < len; _j++) { \ CU_ASSERT_DOUBLE_EQUAL(a[_j], b[_j], 1e-9); \ } \ } while (0); \ } #define assert_arrays_equal(len, a, b) \ { \ do { \ tsk_size_t _j; \ for (_j = 0; _j < len; _j++) { \ CU_ASSERT_EQUAL(a[_j], b[_j]); \ } \ } while (0); \ } /* Array equality if the arrays contain NaN values NB: the float cast for NaNs 
is for mingw, which complains without */ #define assert_arrays_almost_equal_nan(len, a, b) \ { \ do { \ tsk_size_t _j; \ for (_j = 0; _j < len; _j++) { \ if (isnan((float) a[_j]) || isnan((float) b[_j])) { \ CU_ASSERT_EQUAL_FATAL(isnan((float) a[_j]), isnan((float) b[_j])); \ } else { \ CU_ASSERT_DOUBLE_EQUAL(a[_j], b[_j], 1e-9); \ } \ } \ } while (0); \ } extern const char *single_tree_ex_nodes; extern const char *single_tree_ex_edges; extern const char *single_tree_ex_sites; extern const char *single_tree_ex_mutations; extern const char *multiple_tree_ex_nodes; extern const char *multiple_tree_ex_edges; extern const char *odd_tree1_ex_nodes; extern const char *odd_tree1_ex_edges; extern const char *multi_root_tree_ex_nodes; extern const char *multi_root_tree_ex_edges; extern const char *multi_path_tree_ex_nodes; extern const char *multi_path_tree_ex_edges; extern const char *nonbinary_ex_nodes; extern const char *nonbinary_ex_edges; extern const char *nonbinary_ex_sites; extern const char *nonbinary_ex_mutations; extern const char *unary_ex_nodes; extern const char *unary_ex_edges; extern const char *unary_ex_sites; extern const char *unary_ex_mutations; extern const char *internal_sample_ex_nodes; extern const char *internal_sample_ex_edges; extern const char *internal_sample_ex_sites; extern const char *internal_sample_ex_mutations; extern const char *multiroot_ex_nodes; extern const char *multiroot_ex_edges; extern const char *multiroot_ex_sites; extern const char *multiroot_ex_mutations; extern const char *empty_ex_nodes; extern const char *empty_ex_edges; extern const char *paper_ex_nodes; extern const char *paper_ex_edges; extern const char *paper_ex_sites; extern const char *paper_ex_mutations; extern const char *paper_ex_individuals; extern const char *missing_ex_nodes; extern const char *missing_ex_edges; #endif ================================================ FILE: c/tskit/convert.c ================================================ /* * MIT License * * 
Copyright (c) 2018-2025 Tskit Developers * Copyright (c) 2015-2017 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include /* ======================================================== * * Newick output. * ======================================================== */ /* This infrastructure is left-over from an earlier more complex version * of this algorithm that worked over a tree sequence and cached the newick * subtrees, updating according to diffs. It's unclear whether this complexity * was of any real-world use, since newick output for large trees is pretty * pointless. 
*/

/* Working state for one newick conversion of a single tree. */
typedef struct {
    /* Number of digits printed after the decimal point of a branch length. */
    unsigned int precision;
    /* Option bits; TSK_NEWICK_LEGACY_MS_LABELS is the one read below. */
    tsk_flags_t options;
    /* Not referenced by the functions in this section. */
    char *newick;
    /* Scratch stack of node IDs used by the traversal; owned by this struct. */
    tsk_id_t *traversal_stack;
    const tsk_tree_t *tree;
} tsk_newick_converter_t;

/* Writes the newick string for the subtree of self->tree rooted at ``root``
 * into ``buffer``, which holds ``buffer_size`` bytes, followed by ';' and a
 * NUL terminator.
 *
 * With TSK_NEWICK_LEGACY_MS_LABELS only nodes without children are labelled,
 * as the node ID plus one. Otherwise nodes flagged TSK_NODE_IS_SAMPLE are
 * labelled "n<ID>". Every node except ``root`` is followed by ":<length>",
 * where length is the parent's time minus the node's time.
 *
 * Returns 0 on success, TSK_ERR_NODE_OUT_OF_BOUNDS for a bad root,
 * TSK_ERR_BAD_PARAM_VALUE for a NULL buffer, TSK_ERR_BUFFER_OVERFLOW when the
 * output does not fit, and TSK_ERR_IO when snprintf fails. */
static int
tsk_newick_converter_run(
    tsk_newick_converter_t *self, tsk_id_t root, size_t buffer_size, char *buffer)
{
    int ret = TSK_ERR_GENERIC;
    const tsk_tree_t *tree = self->tree;
    tsk_id_t *stack = self->traversal_stack;
    const double *time = self->tree->tree_sequence->tables->nodes.time;
    const tsk_flags_t *flags = self->tree->tree_sequence->tables->nodes.flags;
    int stack_top = 0;
    int label;
    size_t s = 0; /* number of bytes written to buffer so far */
    int r;
    tsk_id_t u, v, w, root_parent;
    double branch_length;
    bool ms_labels = self->options & TSK_NEWICK_LEGACY_MS_LABELS;
    const char *label_format = ms_labels ? "%d" : "n%d";

    if (root < 0 || root >= (tsk_id_t) self->tree->num_nodes) {
        ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
        goto out;
    }
    if (buffer == NULL) {
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    root_parent = tree->parent[root];
    stack[0] = root;
    /* u holds the parent of the node emitted most recently. A node with
     * children is left on the stack when it is first seen, and is emitted
     * when it is seen again with v == u, i.e. once its children are done. */
    u = root_parent;
    while (stack_top >= 0) {
        v = stack[stack_top];
        if (tree->left_child[v] != TSK_NULL && v != u) {
            /* First visit of a node with children: open a group and push the
             * children right-to-left so the leftmost is processed first. */
            if (s >= buffer_size) {
                ret = tsk_trace_error(TSK_ERR_BUFFER_OVERFLOW);
                goto out;
            }
            buffer[s] = '(';
            s++;
            for (w = tree->right_child[v]; w != TSK_NULL; w = tree->left_sib[w]) {
                stack_top++;
                stack[stack_top] = w;
            }
        } else {
            /* Childless node, or a node whose children have all been
             * written: emit its label and branch length. */
            u = tree->parent[v];
            stack_top--;
            label = -1;
            if (ms_labels) {
                if (tree->left_child[v] == TSK_NULL) {
                    label = (int) v + 1;
                }
            } else if (flags[v] & TSK_NODE_IS_SAMPLE) {
                label = (int) v;
            }
            if (label != -1) {
                if (s >= buffer_size) {
                    ret = tsk_trace_error(TSK_ERR_BUFFER_OVERFLOW);
                    goto out;
                }
                r = snprintf(buffer + s, buffer_size - s, label_format, label);
                if (r < 0) {
                    ret = tsk_trace_error(TSK_ERR_IO);
                    goto out;
                }
                /* snprintf reports the untruncated length, so s reaching
                 * buffer_size means the text did not fit. */
                s += (size_t) r;
                if (s >= buffer_size) {
                    ret = tsk_trace_error(TSK_ERR_BUFFER_OVERFLOW);
                    goto out;
                }
            }
            if (u != root_parent) {
                /* Not the requested root: add the branch length, then close
                 * the group after the last child or separate from the next
                 * sibling. */
                branch_length = (time[u] - time[v]);
                r = snprintf(buffer + s, buffer_size - s, ":%.*f", (int) self->precision,
                    branch_length);
                if (r < 0) {
                    ret = tsk_trace_error(TSK_ERR_IO);
                    goto out;
                }
                s += (size_t) r;
                if (s >= buffer_size) {
                    ret = tsk_trace_error(TSK_ERR_BUFFER_OVERFLOW);
                    goto out;
                }
                if (v == tree->right_child[u]) {
                    buffer[s] = ')';
                } else {
                    buffer[s] = ',';
                }
                s++;
            }
        }
    }
    /* Room is needed for both the closing ';' and the NUL terminator. */
    if ((s + 1) >= buffer_size) {
        ret = tsk_trace_error(TSK_ERR_BUFFER_OVERFLOW);
        goto out;
    }
    buffer[s] = ';';
    buffer[s + 1] = '\0';
    ret = 0;
out:
    return ret;
}

/* Zeroes ``self``, records the conversion parameters and allocates the
 * traversal stack with tsk_tree_get_size_bound(tree) entries. Returns 0 on
 * success or TSK_ERR_NO_MEMORY if the allocation fails. */
static int
tsk_newick_converter_init(tsk_newick_converter_t *self, const tsk_tree_t *tree,
    unsigned int precision, tsk_flags_t options)
{
    int ret = 0;

    tsk_memset(self, 0, sizeof(tsk_newick_converter_t));
    self->precision = precision;
    self->options = options;
    self->tree = tree;
    self->traversal_stack
        = tsk_malloc(tsk_tree_get_size_bound(tree) * sizeof(*self->traversal_stack));
    if (self->traversal_stack == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }
out:
    return ret;
}

/* Releases the traversal stack. Always returns 0. */
static int
tsk_newick_converter_free(tsk_newick_converter_t *self)
{
    tsk_safe_free(self->traversal_stack);
    return 0;
}

/* Public entry point: writes the newick representation of the subtree of
 * ``tree`` rooted at ``root`` into ``buffer``. Returns 0 on success or the
 * error from tsk_newick_converter_init or tsk_newick_converter_run. */
int
tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision,
    tsk_flags_t options, size_t buffer_size, char *buffer)
{
    int ret = 0;
    tsk_newick_converter_t nc;

    ret = tsk_newick_converter_init(&nc, tree, precision, options);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_newick_converter_run(&nc, root, buffer_size, buffer);
out:
    /* init zeroes nc before anything can fail, so this runs on every path. */
    tsk_newick_converter_free(&nc);
    return ret;
}

================================================
FILE: c/tskit/convert.h
================================================
/*
 * MIT License
 *
 * Copyright (c) 2018-2021 Tskit Developers
 * Copyright (c) 2015-2017 University of Oxford
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to
permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef TSK_CONVERT_H #define TSK_CONVERT_H #ifdef __cplusplus extern "C" { #endif #include #define TSK_NEWICK_LEGACY_MS_LABELS (1 << 0) int tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision, tsk_flags_t options, size_t buffer_size, char *buffer); #ifdef __cplusplus } #endif #endif ================================================ FILE: c/tskit/core.c ================================================ /* * MIT License * * Copyright (c) 2019-2025 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #define UUID_NUM_BYTES 16 #if defined(_WIN32) #include #include static int TSK_WARN_UNUSED get_random_bytes(uint8_t *buf) { /* Based on CPython's code in bootstrap_hash.c */ int ret = 0; HCRYPTPROV hCryptProv = (HCRYPTPROV) NULL; if (!CryptAcquireContext( &hCryptProv, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) { ret = tsk_trace_error(TSK_ERR_GENERATE_UUID); goto out; } if (!CryptGenRandom(hCryptProv, (DWORD) UUID_NUM_BYTES, buf)) { ret = tsk_trace_error(TSK_ERR_GENERATE_UUID); goto out; } if (!CryptReleaseContext(hCryptProv, 0)) { hCryptProv = (HCRYPTPROV) NULL; ret = tsk_trace_error(TSK_ERR_GENERATE_UUID); goto out; } hCryptProv = (HCRYPTPROV) NULL; out: if (hCryptProv != (HCRYPTPROV) NULL) { CryptReleaseContext(hCryptProv, 0); } return ret; } #else /* Assuming the existance of /dev/urandom on Unix platforms */ static int TSK_WARN_UNUSED get_random_bytes(uint8_t *buf) { int ret = 0; FILE *f = fopen("/dev/urandom", "r"); if (f == NULL) { ret = tsk_trace_error(TSK_ERR_GENERATE_UUID); goto out; } if (fread(buf, UUID_NUM_BYTES, 1, f) != 1) { ret = tsk_trace_error(TSK_ERR_GENERATE_UUID); goto out; } if (fclose(f) != 0) { ret = tsk_trace_error(TSK_ERR_GENERATE_UUID); goto out; } out: return ret; } #endif /* Generate a new UUID4 using a system-generated source of randomness. * Note that this function writes a NULL terminator to the end of this * string, so that the total length of the buffer must be 37 bytes. 
*/ int tsk_generate_uuid(char *dest, int TSK_UNUSED(flags)) { int ret = 0; uint8_t buf[UUID_NUM_BYTES]; const char *pattern = "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x"; ret = get_random_bytes(buf); if (ret != 0) { goto out; } if (snprintf(dest, TSK_UUID_SIZE + 1, pattern, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11], buf[12], buf[13], buf[14], buf[15]) < 0) { ret = tsk_trace_error(TSK_ERR_GENERATE_UUID); goto out; } out: return ret; } static const char * tsk_strerror_internal(int err) { const char *ret = "Unknown error"; switch (err) { case 0: ret = "Normal exit condition. This is not an error!"; break; /* General errors */ case TSK_ERR_GENERIC: ret = "Generic error; please file a bug report. (TSK_ERR_GENERIC)"; break; case TSK_ERR_NO_MEMORY: ret = "Out of memory. (TSK_ERR_NO_MEMORY)"; break; case TSK_ERR_IO: if (errno != 0) { ret = strerror(errno); } else { ret = "Unspecified IO error"; } break; case TSK_ERR_BAD_PARAM_VALUE: ret = "Bad parameter value provided. (TSK_ERR_BAD_PARAM_VALUE)"; break; case TSK_ERR_BUFFER_OVERFLOW: ret = "Supplied buffer is too small. (TSK_ERR_BUFFER_OVERFLOW)"; break; case TSK_ERR_UNSUPPORTED_OPERATION: ret = "Operation cannot be performed in current configuration. " "(TSK_ERR_UNSUPPORTED_OPERATION)"; break; case TSK_ERR_GENERATE_UUID: ret = "Error generating UUID. (TSK_ERR_GENERATE_UUID)"; break; case TSK_ERR_EOF: ret = "End of file. (TSK_ERR_EOF)"; break; /* File format errors */ case TSK_ERR_FILE_FORMAT: ret = "File format error. (TSK_ERR_FILE_FORMAT)"; break; case TSK_ERR_FILE_VERSION_TOO_OLD: ret = "tskit file version too old. Please upgrade using the " "'tskit upgrade' command from tskit version<0.6.2. " "(TSK_ERR_FILE_VERSION_TOO_OLD)"; break; case TSK_ERR_FILE_VERSION_TOO_NEW: ret = "tskit file version is too new for this instance. " "Please upgrade tskit to the latest version. 
" "(TSK_ERR_FILE_VERSION_TOO_NEW)"; break; case TSK_ERR_REQUIRED_COL_NOT_FOUND: ret = "A required column was not found in the file. " "(TSK_ERR_REQUIRED_COL_NOT_FOUND)"; break; case TSK_ERR_BOTH_COLUMNS_REQUIRED: ret = "Both columns in a related pair must be provided. " "(TSK_ERR_BOTH_COLUMNS_REQUIRED)"; break; case TSK_ERR_BAD_COLUMN_TYPE: ret = "An incompatible type for a column was found in the file. " "(TSK_ERR_BAD_COLUMN_TYPE)"; break; /* Out of bounds errors */ case TSK_ERR_BAD_OFFSET: ret = "Bad offset provided in input array. (TSK_ERR_BAD_OFFSET)"; break; case TSK_ERR_NODE_OUT_OF_BOUNDS: ret = "Node out of bounds. (TSK_ERR_NODE_OUT_OF_BOUNDS)"; break; case TSK_ERR_EDGE_OUT_OF_BOUNDS: ret = "Edge out of bounds. (TSK_ERR_EDGE_OUT_OF_BOUNDS)"; break; case TSK_ERR_POPULATION_OUT_OF_BOUNDS: ret = "Population out of bounds. (TSK_ERR_POPULATION_OUT_OF_BOUNDS)"; break; case TSK_ERR_SITE_OUT_OF_BOUNDS: ret = "Site out of bounds. (TSK_ERR_SITE_OUT_OF_BOUNDS)"; break; case TSK_ERR_MUTATION_OUT_OF_BOUNDS: ret = "Mutation out of bounds. (TSK_ERR_MUTATION_OUT_OF_BOUNDS)"; break; case TSK_ERR_MIGRATION_OUT_OF_BOUNDS: ret = "Migration out of bounds. (TSK_ERR_MIGRATION_OUT_OF_BOUNDS)"; break; case TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS: ret = "Individual out of bounds. (TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS)"; break; case TSK_ERR_PROVENANCE_OUT_OF_BOUNDS: ret = "Provenance out of bounds. (TSK_ERR_PROVENANCE_OUT_OF_BOUNDS)"; break; case TSK_ERR_TIME_NONFINITE: ret = "Times must be finite. (TSK_ERR_TIME_NONFINITE)"; break; case TSK_ERR_GENOME_COORDS_NONFINITE: ret = "Genome coordinates must be finite numbers. " "(TSK_ERR_GENOME_COORDS_NONFINITE)"; break; case TSK_ERR_SEEK_OUT_OF_BOUNDS: ret = "Tree seek position out of bounds. (TSK_ERR_SEEK_OUT_OF_BOUNDS)"; break; case TSK_ERR_KEEP_ROWS_MAP_TO_DELETED: ret = "One of the kept rows in the table refers to a deleted row. " "(TSK_ERR_KEEP_ROWS_MAP_TO_DELETED)"; break; case TSK_ERR_POSITION_OUT_OF_BOUNDS: ret = "Position out of bounds. 
(TSK_ERR_POSITION_OUT_OF_BOUNDS)"; break; /* Edge errors */ case TSK_ERR_NULL_PARENT: ret = "Edge parent is null. (TSK_ERR_NULL_PARENT)"; break; case TSK_ERR_NULL_CHILD: ret = "Edge child is null. (TSK_ERR_NULL_CHILD)"; break; case TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME: ret = "Edges must be listed in (time[parent], child, left) order;" " time[parent] order violated. (TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME)"; break; case TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS: ret = "All edges for a given parent must be contiguous. " "(TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS)"; break; case TSK_ERR_EDGES_NOT_SORTED_CHILD: ret = "Edges must be listed in (time[parent], child, left) order;" " child order violated. (TSK_ERR_EDGES_NOT_SORTED_CHILD)"; break; case TSK_ERR_EDGES_NOT_SORTED_LEFT: ret = "Edges must be listed in (time[parent], child, left) order;" " left order violated. (TSK_ERR_EDGES_NOT_SORTED_LEFT)"; break; case TSK_ERR_BAD_NODE_TIME_ORDERING: ret = "time[parent] must be greater than time[child]. " "(TSK_ERR_BAD_NODE_TIME_ORDERING)"; break; case TSK_ERR_BAD_EDGE_INTERVAL: ret = "Bad edge interval where right <= left. (TSK_ERR_BAD_EDGE_INTERVAL)"; break; case TSK_ERR_DUPLICATE_EDGES: ret = "Duplicate edges provided. (TSK_ERR_DUPLICATE_EDGES)"; break; case TSK_ERR_RIGHT_GREATER_SEQ_LENGTH: ret = "Right coordinate > sequence length. " "(TSK_ERR_RIGHT_GREATER_SEQ_LENGTH)"; break; case TSK_ERR_LEFT_LESS_ZERO: ret = "Left coordinate must be >= 0. (TSK_ERR_LEFT_LESS_ZERO)"; break; case TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN: ret = "Bad edges: contradictory children for a given parent over " "an interval, or indexes need to be rebuilt. " "(TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN)"; break; case TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA: ret = "Can't squash, flush, simplify or link ancestors with edges that have " "non-empty metadata. Removing the metadata from the edges will allow " "these operations to proceed. For example using " "tables.edges.drop_metadata() in the tskit Python API. 
" "(TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA)"; break; /* Site errors */ case TSK_ERR_UNSORTED_SITES: ret = "Sites must be provided in strictly increasing position order. " "(TSK_ERR_UNSORTED_SITES)"; break; case TSK_ERR_DUPLICATE_SITE_POSITION: ret = "Duplicate site positions. (TSK_ERR_DUPLICATE_SITE_POSITION)"; break; case TSK_ERR_BAD_SITE_POSITION: ret = "Site positions must be between 0 and sequence_length. " "(TSK_ERR_BAD_SITE_POSITION)"; break; /* Mutation errors */ case TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE: ret = "Specified parent mutation is at a different site. " "(TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE)"; break; case TSK_ERR_MUTATION_PARENT_EQUAL: ret = "Parent mutation refers to itself. (TSK_ERR_MUTATION_PARENT_EQUAL)"; break; case TSK_ERR_MUTATION_PARENT_AFTER_CHILD: ret = "Parent mutation ID must be < current ID. " "(TSK_ERR_MUTATION_PARENT_AFTER_CHILD)"; break; case TSK_ERR_MUTATION_PARENT_INCONSISTENT: ret = "Mutation parent references form a loop. " "(TSK_ERR_MUTATION_PARENT_INCONSISTENT)"; break; case TSK_ERR_UNSORTED_MUTATIONS: ret = "Mutations must be provided in non-decreasing site order and " "non-increasing time order within each site. " "(TSK_ERR_UNSORTED_MUTATIONS)"; break; case TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE: ret = "A mutation's time must be >= the node time, or be marked as " "'unknown'. (TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE)"; break; case TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION: ret = "A mutation's time must be <= the parent mutation time (if known), or " "be marked as 'unknown'. " "(TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION)"; break; case TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE: ret = "A mutation's time must be < the parent node of the edge on which it " "occurs, or be marked as 'unknown'. 
" "(TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE)"; break; case TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN: ret = "Mutation times must either be all marked 'unknown', or all be known " "values for any single site. " "(TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN)"; break; case TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME: ret = "Some mutation times are marked 'unknown' for a method that requires " "no unknown times. (Use compute_mutation_times to add times?) " "(TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME)"; break; case TSK_ERR_BAD_MUTATION_PARENT: ret = "A mutation's parent is not consistent with the topology of the tree. " "Use compute_mutation_parents to set the parents correctly." "(TSK_ERR_BAD_MUTATION_PARENT)"; break; /* Migration errors */ case TSK_ERR_UNSORTED_MIGRATIONS: ret = "Migrations must be sorted by time. (TSK_ERR_UNSORTED_MIGRATIONS)"; break; /* Sample errors */ case TSK_ERR_DUPLICATE_SAMPLE: ret = "Duplicate sample value. (TSK_ERR_DUPLICATE_SAMPLE)"; break; case TSK_ERR_BAD_SAMPLES: ret = "The nodes provided are not samples. (TSK_ERR_BAD_SAMPLES)"; break; /* Table errors */ case TSK_ERR_BAD_TABLE_POSITION: ret = "Bad table position provided to truncate/reset. " "(TSK_ERR_BAD_TABLE_POSITION)"; break; case TSK_ERR_BAD_SEQUENCE_LENGTH: ret = "Sequence length must be > 0. (TSK_ERR_BAD_SEQUENCE_LENGTH)"; break; case TSK_ERR_TABLES_NOT_INDEXED: ret = "Table collection must be indexed. (TSK_ERR_TABLES_NOT_INDEXED)"; break; case TSK_ERR_TABLES_BAD_INDEXES: ret = "Table collection indexes inconsistent: do they need to be rebuilt? " "(TSK_ERR_TABLES_BAD_INDEXES)"; break; case TSK_ERR_TABLE_OVERFLOW: ret = "Table too large; cannot allocate more than 2**31 rows. This error " "is often caused by a lack of simplification when simulating. " "(TSK_ERR_TABLE_OVERFLOW)"; break; case TSK_ERR_COLUMN_OVERFLOW: ret = "Table column too large; cannot be more than 2**64 bytes. 
" "(TSK_ERR_COLUMN_OVERFLOW)"; break; case TSK_ERR_TREE_OVERFLOW: ret = "Too many trees; cannot be more than 2**31. (TSK_ERR_TREE_OVERFLOW)"; break; case TSK_ERR_METADATA_DISABLED: ret = "Metadata is disabled for this table, so cannot be set. " "(TSK_ERR_METADATA_DISABLED)"; break; /* Limitations */ case TSK_ERR_ONLY_INFINITE_SITES: ret = "Only infinite sites mutations are supported for this operation, " "i.e. at most a single mutation per site. " "(TSK_ERR_ONLY_INFINITE_SITES)"; break; case TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED: ret = "Migrations not currently supported by simplify. " "(TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED)"; break; case TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED: ret = "Migrations not currently supported by sort. " "(TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED)"; break; case TSK_ERR_SORT_OFFSET_NOT_SUPPORTED: ret = "Sort offsets for sites and mutations must be either 0 " "or the length of the respective tables. Intermediate values " "are not supported. (TSK_ERR_SORT_OFFSET_NOT_SUPPORTED)"; break; case TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED: ret = "Only binary mutations are supported for this operation. " "(TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED)"; break; case TSK_ERR_MIGRATIONS_NOT_SUPPORTED: ret = "Migrations not currently supported by this operation. " "(TSK_ERR_MIGRATIONS_NOT_SUPPORTED)"; break; case TSK_ERR_CANNOT_EXTEND_FROM_SELF: ret = "Tables can only be extended using rows from a different table. " "(TSK_ERR_CANNOT_EXTEND_FROM_SELF)"; break; case TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED: ret = "Silent mutations not supported by this operation. " "(TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED)"; break; case TSK_ERR_VARIANT_CANT_DECODE_COPY: ret = "Can't decode a copy of a variant. (TSK_ERR_VARIANT_CANT_DECODE_COPY)"; break; case TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA: ret = "A tree sequence can't take ownership of tables with " "TSK_NO_EDGE_METADATA. 
(TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA)"; break; case TSK_ERR_UNDEFINED_NONBINARY: ret = "Operation undefined for nonbinary trees. " "(TSK_ERR_UNDEFINED_NONBINARY)"; break; case TSK_ERR_UNDEFINED_MULTIROOT: ret = "Operation undefined for trees that are not singly-rooted. " "(TSK_ERR_UNDEFINED_MULTIROOT)"; break; /* Stats errors */ case TSK_ERR_BAD_NUM_WINDOWS: ret = "Must have at least one window, [0, L]. (TSK_ERR_BAD_NUM_WINDOWS)"; break; case TSK_ERR_BAD_WINDOWS: ret = "Windows must be increasing list [0, ..., L]. (TSK_ERR_BAD_WINDOWS)"; break; case TSK_ERR_MULTIPLE_STAT_MODES: ret = "Cannot specify more than one stats mode. " "(TSK_ERR_MULTIPLE_STAT_MODES)"; break; case TSK_ERR_BAD_STATE_DIMS: ret = "Must have state dimension >= 1. (TSK_ERR_BAD_STATE_DIMS)"; break; case TSK_ERR_BAD_RESULT_DIMS: ret = "Must have result dimension >= 1. (TSK_ERR_BAD_RESULT_DIMS)"; break; case TSK_ERR_INSUFFICIENT_SAMPLE_SETS: ret = "Insufficient sample sets provided. " "(TSK_ERR_INSUFFICIENT_SAMPLE_SETS)"; break; case TSK_ERR_INSUFFICIENT_INDEX_TUPLES: ret = "Insufficient sample set index tuples provided. " "(TSK_ERR_INSUFFICIENT_INDEX_TUPLES)"; break; case TSK_ERR_BAD_SAMPLE_SET_INDEX: ret = "Sample set index out of bounds. (TSK_ERR_BAD_SAMPLE_SET_INDEX)"; break; case TSK_ERR_EMPTY_SAMPLE_SET: ret = "Samples cannot be empty. (TSK_ERR_EMPTY_SAMPLE_SET)"; break; case TSK_ERR_UNSUPPORTED_STAT_MODE: ret = "Requested statistics mode not supported for this method. " "(TSK_ERR_UNSUPPORTED_STAT_MODE)"; break; case TSK_ERR_TIME_UNCALIBRATED: ret = "Statistics using branch lengths cannot be calculated when time_units " "is 'uncalibrated'. (TSK_ERR_TIME_UNCALIBRATED)"; break; case TSK_ERR_STAT_POLARISED_UNSUPPORTED: ret = "The TSK_STAT_POLARISED option is not supported by this statistic. " "(TSK_ERR_STAT_POLARISED_UNSUPPORTED)"; break; case TSK_ERR_STAT_SPAN_NORMALISE_UNSUPPORTED: ret = "The TSK_STAT_SPAN_NORMALISE option is not supported by this " "statistic. 
" "(TSK_ERR_STAT_SPAN_NORMALISE_UNSUPPORTED)"; break; case TSK_ERR_INSUFFICIENT_WEIGHTS: ret = "Insufficient weights provided (at least 1 required). " "(TSK_ERR_INSUFFICIENT_WEIGHTS)"; break; /* Pair coalescence errors */ case TSK_ERR_BAD_NODE_BIN_MAP: ret = "Node-to-bin map contains values less than TSK_NULL. " "(TSK_ERR_BAD_NODE_BIN_MAP)"; break; case TSK_ERR_BAD_NODE_BIN_MAP_DIM: ret = "Maximum index in node-to-bin map is greater than the " "output dimension. (TSK_ERR_BAD_NODE_BIN_MAP_DIM)"; break; case TSK_ERR_BAD_QUANTILES: ret = "Quantiles must be between 0 and 1 (inclusive) " "and strictly increasing. (TSK_ERR_BAD_QUANTILES)"; break; case TSK_ERR_UNSORTED_TIMES: ret = "Times must be strictly increasing. (TSK_ERR_UNSORTED_TIMES)"; break; case TSK_ERR_BAD_TIME_WINDOWS_DIM: ret = "Must have at least one time window. (TSK_ERR_BAD_TIME_WINDOWS_DIM)"; break; case TSK_ERR_BAD_SAMPLE_PAIR_TIMES: ret = "All sample times must be equal to the start of first time window. " "(TSK_ERR_BAD_SAMPLE_PAIR_TIMES)"; break; case TSK_ERR_BAD_TIME_WINDOWS: ret = "Time windows must start at zero and be strictly increasing. " "(TSK_ERR_BAD_TIME_WINDOWS)"; break; case TSK_ERR_BAD_TIME_WINDOWS_END: ret = "Time windows must end at infinity for this method. " "(TSK_ERR_BAD_TIME_WINDOWS_END)"; break; case TSK_ERR_BAD_NODE_TIME_WINDOW: ret = "Node time does not fall within assigned time window. " "(TSK_ERR_BAD_NODE_TIME_WINDOW)"; break; /* Two locus errors */ case TSK_ERR_STAT_UNSORTED_POSITIONS: ret = "The provided positions are not sorted in strictly increasing " "order. (TSK_ERR_STAT_UNSORTED_POSITIONS)"; break; case TSK_ERR_STAT_DUPLICATE_POSITIONS: ret = "The provided positions contain duplicates. " "(TSK_ERR_STAT_DUPLICATE_POSITIONS)"; break; case TSK_ERR_STAT_UNSORTED_SITES: ret = "The provided sites are not sorted in strictly increasing position " "order. (TSK_ERR_STAT_UNSORTED_SITES)"; break; case TSK_ERR_STAT_DUPLICATE_SITES: ret = "The provided sites contain duplicated entries. 
" "(TSK_ERR_STAT_DUPLICATE_SITES)"; break; /* Mutation mapping errors */ case TSK_ERR_GENOTYPES_ALL_MISSING: ret = "Must provide at least one non-missing genotype. " "(TSK_ERR_GENOTYPES_ALL_MISSING)"; break; case TSK_ERR_BAD_GENOTYPE: ret = "Bad genotype value provided. (TSK_ERR_BAD_GENOTYPE)"; break; case TSK_ERR_BAD_ANCESTRAL_STATE: ret = "Bad ancestral state specified. (TSK_ERR_BAD_ANCESTRAL_STATE)"; break; /* Genotype decoding errors */ case TSK_ERR_MUST_IMPUTE_NON_SAMPLES: ret = "Cannot generate genotypes for non-samples when isolated nodes are " "considered as missing. (TSK_ERR_MUST_IMPUTE_NON_SAMPLES)"; break; case TSK_ERR_ALLELE_NOT_FOUND: ret = "An allele was not found in the user-specified allele map. " "(TSK_ERR_ALLELE_NOT_FOUND)"; break; case TSK_ERR_TOO_MANY_ALLELES: ret = "Cannot have more than 2147483647 alleles (TSK_ERR_TOO_MANY_ALLELES)"; break; case TSK_ERR_ZERO_ALLELES: ret = "Must have at least one allele when specifying an allele map. " "(TSK_ERR_ZERO_ALLELES)"; break; case TSK_ERR_BAD_ALLELE_LENGTH: ret = "Alleles used when decoding alignments must have length one. " "(TSK_ERR_BAD_ALLELE_LENGTH)"; break; case TSK_ERR_MISSING_CHAR_COLLISION: ret = "Alleles used when decoding alignments must not match the missing " "data character. (TSK_ERR_MISSING_CHAR_COLLISION)"; break; /* Distance metric errors */ case TSK_ERR_SAMPLE_SIZE_MISMATCH: ret = "Cannot compare trees with different numbers of samples. " "(TSK_ERR_SAMPLE_SIZE_MISMATCH)"; break; case TSK_ERR_SAMPLES_NOT_EQUAL: ret = "Samples must be identical in trees to compare. " "(TSK_ERR_SAMPLES_NOT_EQUAL)"; break; case TSK_ERR_MULTIPLE_ROOTS: ret = "Trees with multiple roots not supported. (TSK_ERR_MULTIPLE_ROOTS)"; break; case TSK_ERR_UNARY_NODES: ret = "Unsimplified trees with unary nodes are not supported. " "(TSK_ERR_UNARY_NODES)"; break; case TSK_ERR_SEQUENCE_LENGTH_MISMATCH: ret = "Sequence lengths must be identical to compare. 
" "(TSK_ERR_SEQUENCE_LENGTH_MISMATCH)"; break; case TSK_ERR_NO_SAMPLE_LISTS: ret = "The sample_lists option must be enabled on the tree to perform this " "operation. Pass the option to the constructor or method that created " "the tree. (TSK_ERR_NO_SAMPLE_LISTS)"; break; /* Haplotype matching errors */ case TSK_ERR_NULL_VITERBI_MATRIX: ret = "Viterbi matrix has not filled. (TSK_ERR_NULL_VITERBI_MATRIX)"; break; case TSK_ERR_MATCH_IMPOSSIBLE: ret = "No matching haplotype exists with current parameters. " "(TSK_ERR_MATCH_IMPOSSIBLE)"; break; case TSK_ERR_BAD_COMPRESSED_MATRIX_NODE: ret = "The compressed matrix contains a node that subtends no samples. " "(TSK_ERR_BAD_COMPRESSED_MATRIX_NODE)"; break; case TSK_ERR_TOO_MANY_VALUES: ret = "Too many values to compress. (TSK_ERR_TOO_MANY_VALUES)"; break; /* Union errors */ case TSK_ERR_UNION_BAD_MAP: ret = "Node map contains an entry of a node not present in this table " "collection. (TSK_ERR_UNION_BAD_MAP)"; break; case TSK_ERR_UNION_DIFF_HISTORIES: // histories could be equivalent, because subset does not reorder // edges (if not sorted) or mutations. ret = "Shared portions of the tree sequences are not equal. " "(TSK_ERR_UNION_DIFF_HISTORIES)"; break; /* IBD errors */ case TSK_ERR_SAME_NODES_IN_PAIR: ret = "Both nodes in the sample pair are the same. " "(TSK_ERR_SAME_NODES_IN_PAIR)"; break; case TSK_ERR_IBD_PAIRS_NOT_STORED: ret = "The sample pairs are not stored by default in ibd_segments. Please " "add the TSK_IBD_STORE_PAIRS option flag if per-pair statistics are " "required. (TSK_ERR_IBD_PAIRS_NOT_STORED)"; break; case TSK_ERR_IBD_SEGMENTS_NOT_STORED: ret = "All segments are not stored by default in ibd_segments. Please " "add the TSK_IBD_STORE_SEGMENTS option flag if they are required. " "(TSK_ERR_IBD_SEGMENTS_NOT_STORED)"; break; /* Simplify errors */ case TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE: ret = "You cannot specify both TSK_SIMPLIFY_KEEP_UNARY and " "TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVDUALS. 
" "(TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE)"; break; /* Individual errors */ case TSK_ERR_UNSORTED_INDIVIDUALS: ret = "Individuals must be provided in an order where children are after " "their parent individuals (TSK_ERR_UNSORTED_INDIVIDUALS)"; break; case TSK_ERR_INDIVIDUAL_SELF_PARENT: ret = "Individuals cannot be their own parents. " "(TSK_ERR_INDIVIDUAL_SELF_PARENT)"; break; case TSK_ERR_INDIVIDUAL_PARENT_CYCLE: ret = "Individuals cannot be their own ancestor. " "(TSK_ERR_INDIVIDUAL_PARENT_CYCLE)"; break; case TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH: ret = "Individual populations cannot be returned " "if an individual has nodes from more than one population. " "(TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH)"; break; case TSK_ERR_INDIVIDUAL_TIME_MISMATCH: ret = "Individual times cannot be returned " "if an individual has nodes from more than one time. " "(TSK_ERR_INDIVIDUAL_TIME_MISMATCH)"; break; case TSK_ERR_EXTEND_EDGES_BAD_MAXITER: ret = "Maximum number of iterations must be positive. " "(TSK_ERR_EXTEND_EDGES_BAD_MAXITER)"; break; } return ret; } int tsk_set_kas_error(int err) { if (err == KAS_ERR_IO) { /* If we've detected an IO error, report it as TSK_ERR_IO so that we have * a consistent error code covering these situtations */ return TSK_ERR_IO; } else { /* Flip this bit. As the error is negative, this sets the bit to 0 */ return err ^ (1 << TSK_KAS_ERR_BIT); } } bool tsk_is_kas_error(int err) { return !(err & (1 << TSK_KAS_ERR_BIT)); } int tsk_get_kas_error(int err) { return err ^ (1 << TSK_KAS_ERR_BIT); } const char * tsk_strerror(int err) { if (err != 0 && tsk_is_kas_error(err)) { return kas_strerror(tsk_get_kas_error(err)); } else { return tsk_strerror_internal(err); } } void __tsk_safe_free(void **ptr) { if (ptr != NULL) { if (*ptr != NULL) { free(*ptr); *ptr = NULL; } } } /* Block allocator. Simple allocator when we lots of chunks of memory * and don't need to free them individually. 
*/ void tsk_blkalloc_print_state(tsk_blkalloc_t *self, FILE *out) { fprintf(out, "Block allocator%p::\n", (void *) self); fprintf(out, "\ttop = %lld\n", (long long) self->top); fprintf(out, "\tchunk_size = %lld\n", (long long) self->chunk_size); fprintf(out, "\tnum_chunks = %lld\n", (long long) self->num_chunks); fprintf(out, "\ttotal_allocated = %lld\n", (long long) self->total_allocated); fprintf(out, "\ttotal_size = %lld\n", (long long) self->total_size); } int TSK_WARN_UNUSED tsk_blkalloc_reset(tsk_blkalloc_t *self) { int ret = 0; self->top = 0; self->current_chunk = 0; self->total_allocated = 0; return ret; } int TSK_WARN_UNUSED tsk_blkalloc_init(tsk_blkalloc_t *self, size_t chunk_size) { int ret = 0; tsk_memset(self, 0, sizeof(tsk_blkalloc_t)); if (chunk_size < 1) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } self->chunk_size = chunk_size; self->top = 0; self->current_chunk = 0; self->total_allocated = 0; self->total_size = 0; self->num_chunks = 0; self->mem_chunks = malloc(sizeof(char *)); if (self->mem_chunks == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->mem_chunks[0] = malloc(chunk_size); if (self->mem_chunks[0] == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->num_chunks = 1; self->total_size = chunk_size + sizeof(void *); out: return ret; } void *TSK_WARN_UNUSED tsk_blkalloc_get(tsk_blkalloc_t *self, size_t size) { void *ret = NULL; void *p; if (size > self->chunk_size) { goto out; } if ((self->top + size) > self->chunk_size) { if (self->current_chunk == (self->num_chunks - 1)) { p = realloc(self->mem_chunks, (self->num_chunks + 1) * sizeof(void *)); if (p == NULL) { goto out; } self->mem_chunks = p; p = malloc(self->chunk_size); if (p == NULL) { goto out; } self->mem_chunks[self->num_chunks] = p; self->num_chunks++; self->total_size += self->chunk_size + sizeof(void *); } self->current_chunk++; self->top = 0; } ret = self->mem_chunks[self->current_chunk] + self->top; self->top += size; 
self->total_allocated += size; out: return ret; } void tsk_blkalloc_free(tsk_blkalloc_t *self) { size_t j; for (j = 0; j < self->num_chunks; j++) { if (self->mem_chunks[j] != NULL) { free(self->mem_chunks[j]); } } if (self->mem_chunks != NULL) { free(self->mem_chunks); } } /* Mirrors the semantics of numpy's searchsorted function. Uses binary * search to find the index of the closest value in the array. */ tsk_size_t tsk_search_sorted(const double *restrict array, tsk_size_t size, double value) { int64_t upper = (int64_t) size; int64_t lower = 0; int64_t offset = 0; int64_t mid; if (upper == 0) { return 0; } while (upper - lower > 1) { mid = (upper + lower) / 2; if (value >= array[mid]) { lower = mid; } else { upper = mid; } } offset = (int64_t) (array[lower] < value); return (tsk_size_t) (lower + offset); } /* Rounds the specified double to the closest multiple of 10**-num_digits. If * num_digits > 22, return value without changes. This is intended for use with * small positive numbers; behaviour with large inputs has not been considered. * * Based on double_round from the Python standard library * https://github.com/python/cpython/blob/master/Objects/floatobject.c#L985 */ double tsk_round(double x, unsigned int ndigits) { double pow1, y, z; z = x; if (ndigits < 22) { pow1 = pow(10.0, (double) ndigits); y = x * pow1; z = round(y); if (fabs(y - z) == 0.5) { /* halfway between two integers; use round-half-even */ z = 2.0 * round(y / 2.0); } z = z / pow1; } return z; } /* As NANs are not equal, use this function to check for equality to TSK_UNKNOWN_TIME */ bool tsk_is_unknown_time(double val) { union { uint64_t i; double f; } nan_union; nan_union.f = val; return nan_union.i == TSK_UNKNOWN_TIME_HEX; } /* Work around a bug which seems to show up on various mixtures of * compiler and libc versions, where isfinite and isnan result in * spurious warnings about casting down to float. 
The original issue * is here: * https://github.com/tskit-dev/tskit/issues/721 * * The simplest approach seems to be to use the builtins where they * are available (clang and gcc), and to use the library macro * otherwise. There would be no disadvantage to using the builtin * version, so there's no real harm in this approach. */ bool tsk_isnan(double val) { #if defined(__GNUC__) return __builtin_isnan(val); #else return isnan(val); #endif } bool tsk_isfinite(double val) { #if defined(__GNUC__) return __builtin_isfinite(val); #else return isfinite(val); #endif } void * tsk_malloc(tsk_size_t size) { /* Avoid malloc(0) as it's not portable */ if (size == 0) { size = 1; } #if TSK_MAX_SIZE > SIZE_MAX if (size > SIZE_MAX) { return NULL; } #endif return malloc((size_t) size); } void * tsk_realloc(void *ptr, tsk_size_t size) { /* We shouldn't ever realloc to a zero size in tskit */ tsk_bug_assert(size > 0); return realloc(ptr, (size_t) size); } /* We keep the size argument here as a size_t because we'd have to * cast the outputs of sizeof() otherwise, which would lead to * less readable code. We need to be careful to use calloc within * the library accordingly, so that size can't overflow on 32 bit. */ void * tsk_calloc(tsk_size_t n, size_t size) { /* Avoid calloc(0) as it's not portable */ if (n == 0) { n = 1; } #if TSK_MAX_SIZE > SIZE_MAX if (n > SIZE_MAX) { return NULL; } #endif return calloc((size_t) n, size); } void * tsk_memset(void *ptr, int fill, tsk_size_t size) { return memset(ptr, fill, (size_t) size); } void * tsk_memcpy(void *dest, const void *src, tsk_size_t size) { return memcpy(dest, src, (size_t) size); } void * tsk_memmove(void *dest, const void *src, tsk_size_t size) { return memmove(dest, src, (size_t) size); } int tsk_memcmp(const void *s1, const void *s2, tsk_size_t size) { return memcmp(s1, s2, (size_t) size); } /* We can't initialise the stream to its real default value because * of limitations on static initialisers. 
To work around this, we initialise * it to NULL and then set the value to the required standard stream * when called. */ FILE *_tsk_debug_stream = NULL; void tsk_set_debug_stream(FILE *f) { _tsk_debug_stream = f; } FILE * tsk_get_debug_stream(void) { if (_tsk_debug_stream == NULL) { _tsk_debug_stream = TSK_DEFAULT_DEBUG_STREAM; } return _tsk_debug_stream; } /* AVL Tree implementation. This is based directly on Knuth's implementation * in TAOCP. See the python/tests/test_avl_tree.py for more information, * and equivalent code annotated with the original algorithm listing. */ static void tsk_avl_tree_int_print_node(tsk_avl_node_int_t *node, int depth, FILE *out) { int d; if (node == NULL) { return; } for (d = 0; d < depth; d++) { fprintf(out, " "); } fprintf(out, "key=%d balance=%d\n", (int) node->key, node->balance); tsk_avl_tree_int_print_node(node->llink, depth + 1, out); tsk_avl_tree_int_print_node(node->rlink, depth + 1, out); } void tsk_avl_tree_int_print_state(tsk_avl_tree_int_t *self, FILE *out) { fprintf(out, "AVL tree: size=%d height=%d\n", (int) self->size, (int) self->height); tsk_avl_tree_int_print_node(self->head.rlink, 0, out); } int tsk_avl_tree_int_init(tsk_avl_tree_int_t *self) { memset(self, 0, sizeof(*self)); return 0; } int tsk_avl_tree_int_free(tsk_avl_tree_int_t *TSK_UNUSED(self)) { return 0; } tsk_avl_node_int_t * tsk_avl_tree_int_get_root(const tsk_avl_tree_int_t *self) { return self->head.rlink; } tsk_avl_node_int_t * tsk_avl_tree_int_search(const tsk_avl_tree_int_t *self, int64_t key) { tsk_avl_node_int_t *P = self->head.rlink; while (P != NULL) { if (key == P->key) { break; } else if (key < P->key) { P = P->llink; } else { P = P->rlink; } } return P; } static int tsk_avl_tree_int_insert_empty(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) { self->head.rlink = node; self->size = 1; self->height = 1; node->llink = NULL; node->rlink = NULL; node->balance = 0; return 0; } #define get_link(a, P) ((a) == -1 ? 
(P)->llink : (P)->rlink) #define set_link(a, P, val) \ do { \ if ((a) == -1) { \ (P)->llink = val; \ } else { \ (P)->rlink = val; \ } \ } while (0); static int tsk_avl_tree_int_insert_non_empty(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) { const int64_t K = node->key; tsk_avl_node_int_t *T = &self->head; tsk_avl_node_int_t *S = T->rlink; tsk_avl_node_int_t *P = T->rlink; tsk_avl_node_int_t *Q, *R; int a; while (true) { if (K == P->key) { /* TODO figure out what the most useful semantics are here. Just * returning 1 as a non-zero value for now. */ return 1; } else if (K < P->key) { Q = P->llink; if (Q == NULL) { Q = node; P->llink = Q; break; } } else { Q = P->rlink; if (Q == NULL) { Q = node; P->rlink = Q; break; } } if (Q->balance != 0) { T = P; S = Q; } P = Q; } self->size++; Q->llink = NULL; Q->rlink = NULL; Q->balance = 0; if (K < S->key) { a = -1; } else { a = 1; } P = get_link(a, S); R = P; while (P != Q) { if (K < P->key) { P->balance = -1; P = P->llink; } else if (K > P->key) { P->balance = 1; P = P->rlink; } } if (S->balance == 0) { S->balance = a; self->height++; } else if (S->balance == -a) { S->balance = 0; } else { if (R->balance == a) { P = R; set_link(a, S, get_link(-a, R)); set_link(-a, R, S); S->balance = 0; R->balance = 0; } else if (R->balance == -a) { P = get_link(-a, R); set_link(-a, R, get_link(a, P)); set_link(a, P, R); set_link(a, S, get_link(-a, P)); set_link(-a, P, S); if (P->balance == a) { S->balance = -a; R->balance = 0; } else if (P->balance == 0) { S->balance = 0; R->balance = 0; } else { S->balance = 0; R->balance = a; } P->balance = 0; } if (S == T->rlink) { T->rlink = P; } else { T->llink = P; } } return 0; } int tsk_avl_tree_int_insert(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) { int ret = 0; if (self->size == 0) { ret = tsk_avl_tree_int_insert_empty(self, node); } else { ret = tsk_avl_tree_int_insert_non_empty(self, node); } return ret; } /* An inorder traversal of the nodes in an AVL tree (or any binary search 
tree) * yields the keys in sorted order. The recursive implementation is safe here * because this is an AVL tree and it is strictly balanced, the depth is very * limited. Using GCC's __builtin_frame_address it looks like the size of a stack * frame for this function is 48 bytes. Assuming a stack size of 1MiB, this * would give us a maximum tree depth of 21845 - so, we're pretty safe. */ static int ordered_nodes_traverse(tsk_avl_node_int_t *node, int index, tsk_avl_node_int_t **out) { if (node == NULL) { return index; } index = ordered_nodes_traverse(node->llink, index, out); out[index] = node; return ordered_nodes_traverse(node->rlink, index + 1, out); } int tsk_avl_tree_int_ordered_nodes(const tsk_avl_tree_int_t *self, tsk_avl_node_int_t **out) { ordered_nodes_traverse(self->head.rlink, 0, out); return 0; } // Bit Array implementation. Allows us to store unsigned integers in a compact manner. // Currently implemented as an array of 32-bit unsigned integers. int tsk_bitset_init(tsk_bitset_t *self, tsk_size_t num_bits, tsk_size_t length) { int ret = 0; self->row_len = (num_bits / TSK_BITSET_BITS) + (num_bits % TSK_BITSET_BITS ? 
1 : 0); self->len = length; self->data = tsk_calloc(self->row_len * length, sizeof(*self->data)); if (self->data == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } out: return ret; } #define BITSET_DATA_ROW(bs, row) ((bs)->data + (row) * (bs)->row_len) void tsk_bitset_intersect(const tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, tsk_size_t other_row, tsk_bitset_t *out) { const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row); tsk_bitset_val_t *restrict out_d = out->data; for (tsk_size_t i = 0; i < self->row_len; i++) { out_d[i] = self_d[i] & other_d[i]; } } void tsk_bitset_subtract(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, tsk_size_t other_row) { tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row); for (tsk_size_t i = 0; i < self->row_len; i++) { self_d[i] &= ~(other_d[i]); } } void tsk_bitset_union(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, tsk_size_t other_row) { tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, self_row); const tsk_bitset_val_t *restrict other_d = BITSET_DATA_ROW(other, other_row); for (tsk_size_t i = 0; i < self->row_len; i++) { self_d[i] |= other_d[i]; } } void tsk_bitset_set_bit(tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit) { tsk_bitset_val_t i = (bit / TSK_BITSET_BITS); *(BITSET_DATA_ROW(self, row) + i) |= (tsk_bitset_val_t) 1 << (bit - (TSK_BITSET_BITS * i)); } bool tsk_bitset_contains(const tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit) { tsk_bitset_val_t i = (bit / TSK_BITSET_BITS); return *(BITSET_DATA_ROW(self, row) + i) & ((tsk_bitset_val_t) 1 << (bit - (TSK_BITSET_BITS * i))); } static inline uint32_t popcount(tsk_bitset_val_t v) { // Utilizes 12 operations per chunk. NB this only works on 32 bit integers. 
// Taken from: // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel // There's a nice breakdown of this algorithm here: // https://stackoverflow.com/a/109025 // // The gcc/clang compiler flag will -mpopcnt will convert this code to a // popcnt instruction (most if not all modern CPUs will support this). The // popcnt instruction will yield some speed improvements, which depend on // the tree sequence. // // NB: 32bit counting is typically faster than 64bit counting for this task. // (at least on x86-64) v = v - ((v >> 1) & 0x55555555); v = (v & 0x33333333) + ((v >> 2) & 0x33333333); return (((v + (v >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; } tsk_size_t tsk_bitset_count(const tsk_bitset_t *self, tsk_size_t row) { tsk_size_t i = 0; tsk_size_t count = 0; const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, row); for (i = 0; i < self->row_len; i++) { count += popcount(self_d[i]); } return count; } void tsk_bitset_get_items( const tsk_bitset_t *self, tsk_size_t row, tsk_id_t *items, tsk_size_t *n_items) { // Get the items stored in the row of a bitset. // Uses a de Bruijn sequence lookup table to determine the lowest bit set. 
// See the wikipedia article for more info: https://w.wiki/BYiF tsk_size_t i, n, off; tsk_bitset_val_t v, lsb; // least significant bit static const tsk_id_t lookup[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; const tsk_bitset_val_t *restrict self_d = BITSET_DATA_ROW(self, row); n = 0; for (i = 0; i < self->row_len; i++) { v = self_d[i]; off = i * TSK_BITSET_BITS; if (v == 0) { continue; } while ((lsb = v & -v)) { items[n] = lookup[(lsb * 0x077cb531U) >> 27] + (tsk_id_t) off; n++; v ^= lsb; } } *n_items = n; } void tsk_bitset_free(tsk_bitset_t *self) { tsk_safe_free(self->data); } ================================================ FILE: c/tskit/core.h ================================================ /* * MIT License * * Copyright (c) 2019-2025 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ /** * @file core.h * @brief Core utilities used in all of tskit. */ #ifndef __TSK_CORE_H__ #define __TSK_CORE_H__ #ifdef __cplusplus extern "C" { #endif #include #include #include #include #include #ifdef __GNUC__ #define TSK_WARN_UNUSED __attribute__((warn_unused_result)) #define TSK_UNUSED(x) TSK_UNUSED_##x __attribute__((__unused__)) #else #define TSK_WARN_UNUSED #define TSK_UNUSED(x) TSK_UNUSED_##x /* Don't bother with restrict for MSVC */ #define restrict #endif /* We assume CHAR_BIT == 8 when loading strings from 8-bit byte arrays */ #if CHAR_BIT != 8 #error CHAR_BIT MUST EQUAL 8 #endif /* This sets up TSK_DBL_DECIMAL_DIG, which can then be used as a * precision specifier when writing out doubles, if you want sufficient * decimal digits to be written to guarantee a lossless round-trip * after being read back in. Usage: * * printf("%.*g", TSK_DBL_DECIMAL_DIG, foo); * * See https://stackoverflow.com/a/19897395/2752221 */ #ifdef DBL_DECIMAL_DIG #define TSK_DBL_DECIMAL_DIG (DBL_DECIMAL_DIG) #else #define TSK_DBL_DECIMAL_DIG (DBL_DIG + 3) #endif /** @brief Tskit Object IDs. @rst All objects in tskit are referred to by integer IDs corresponding to the row they occupy in the relevant table. The ``tsk_id_t`` type should be used when manipulating these ID values. The reserved value :c:macro:`TSK_NULL` (-1) defines missing data. @endrst */ #ifdef _TSK_BIG_TABLES /* Allow tables to have more than 2^31 rows. This is an EXPERIMENTAL feature * and is not supported in any way. This typedef is only included for * future-proofing purposes, so that we can be sure that we don't make any * design decisions that are incompatible with big tables by building the * library in 64 bit mode in CI. See the discussion here for more background: * https://github.com/tskit-dev/tskit/issues/343 * * If you need big tables, please open an issue on GitHub to discuss, or comment * on the thread above. 
*/ typedef int64_t tsk_id_t; #define TSK_MAX_ID INT64_MAX - 1 #define TSK_ID_STORAGE_TYPE KAS_INT64 #else typedef int32_t tsk_id_t; #define TSK_MAX_ID INT32_MAX - 1 #define TSK_ID_STORAGE_TYPE KAS_INT32 #endif /** @brief Tskit sizes. @rst The ``tsk_size_t`` type is an unsigned integer used for any size or count value. @endrst */ typedef uint64_t tsk_size_t; #define TSK_MAX_SIZE UINT64_MAX #define TSK_SIZE_STORAGE_TYPE KAS_UINT64 /** @brief Container for bitwise flags. @rst Bitwise flags are used in tskit as a column type and also as a way to specify options to API functions. @endrst */ typedef uint32_t tsk_flags_t; #define TSK_FLAGS_STORAGE_TYPE KAS_UINT32 /** @brief Boolean type. @rst Fixed-size (1 byte) boolean values. @endrst */ typedef uint8_t tsk_bool_t; // clang-format off /** @defgroup API_VERSION_GROUP API version macros. @{ */ /** The library major version. Incremented when breaking changes to the API or ABI are introduced. This includes any changes to the signatures of functions and the sizes and types of externally visible structs. */ #define TSK_VERSION_MAJOR 1 /** The library minor version. Incremented when non-breaking backward-compatible changes to the API or ABI are introduced, i.e., the addition of a new function. */ #define TSK_VERSION_MINOR 3 /** The library patch version. Incremented when any changes not relevant to the to the API or ABI are introduced, i.e., internal refactors of bugfixes. */ #define TSK_VERSION_PATCH 1 /** @} */ /* We define a specific NAN value for default mutation time which indicates the time is unknown. We use a specific value so that if mutation time is set to a NAN from a computation we can reject it. This specific value is a non-signalling NAN with the last six fraction bytes set to the ascii of "tskit!" 
*/
#define TSK_UNKNOWN_TIME_HEX 0x7FF874736B697421ULL
/* Return the double whose bit pattern is TSK_UNKNOWN_TIME_HEX. The value is
 * produced by writing the 64-bit integer member of a union and reading back
 * the double member, so the exact NaN payload is preserved. */
static inline double
__tsk_nan_f(void)
{
    const union {
        uint64_t i;
        double f;
    } nan_union = { .i = TSK_UNKNOWN_TIME_HEX };
    return nan_union.f;
}

/**
@defgroup GENERIC_CONSTANTS General constants used throughout the library.
@{
*/
/**
Used in node flags to indicate that a node is a sample node.
*/
#define TSK_NODE_IS_SAMPLE 1u

/**
Null value used for cases such as absent id references.
*/
#define TSK_NULL ((tsk_id_t) -1)

/**
Value used for missing data in genotype arrays.
*/
#define TSK_MISSING_DATA (-1)

/**
Value to indicate that a time is unknown. Note that this value is a non-signalling NAN
whose representation differs from the NAN generated by computations such as divide by
zeros.
*/
#define TSK_UNKNOWN_TIME __tsk_nan_f()

/** @} */

#define TSK_TIME_UNITS_UNKNOWN "unknown"
#define TSK_TIME_UNITS_UNCALIBRATED "uncalibrated"

/* TSK_FILE_FORMAT_NAME_LENGTH is the number of characters in
 * TSK_FILE_FORMAT_NAME, not counting the terminating NUL. */
#define TSK_FILE_FORMAT_NAME "tskit.trees"
#define TSK_FILE_FORMAT_NAME_LENGTH 11
#define TSK_FILE_FORMAT_VERSION_MAJOR 12
#define TSK_FILE_FORMAT_VERSION_MINOR 7

/**
@defgroup GENERIC_FUNCTION_OPTIONS General options flags used in some functions.
@{
*/
/* Place the common options at the top of the space; this way we can start
options for individual functions at the bottom without worrying about
clashing with the common options
*/

/** Turn on debugging output. Not supported by all functions. */
#define TSK_DEBUG (1u << 31)

/** Do not initialise the parameter object. */
#define TSK_NO_INIT (1u << 30)

/**
Do not run integrity checks before performing an operation.
This performance optimisation should not be used unless the calling code can
guarantee reference integrity within the table collection. References
to rows not in the table or bad offsets will result in undefined
behaviour.
*/
#define TSK_NO_CHECK_INTEGRITY (1u << 29)

/**
Instead of taking a copy of input objects, the function should take ownership
of them and manage their lifecycle.
The caller specifying this flag
should no longer modify or free the object or objects passed. See individual functions
using this flag for what object it applies to.
*/
#define TSK_TAKE_OWNERSHIP (1u << 28)

/** @} */

/**
@defgroup GENERAL_ERROR_GROUP General errors.
@{
*/

/**
Generic error thrown when no other message can be generated.
*/
#define TSK_ERR_GENERIC -1
/**
Memory could not be allocated.
*/
#define TSK_ERR_NO_MEMORY -2
/**
An IO error occurred.
*/
#define TSK_ERR_IO -3
/* NOTE(review): the following four codes carry no description in this header;
   tsk_strerror() is the place to look for the message reported for each. */
#define TSK_ERR_BAD_PARAM_VALUE -4
#define TSK_ERR_BUFFER_OVERFLOW -5
#define TSK_ERR_UNSUPPORTED_OPERATION -6
#define TSK_ERR_GENERATE_UUID -7
/**
The file stream ended after reading zero bytes.
*/
#define TSK_ERR_EOF -8
/** @} */

/**
@defgroup FILE_FORMAT_ERROR_GROUP File format errors.
@{
*/

/**
A file could not be read because it is in the wrong format
*/
#define TSK_ERR_FILE_FORMAT -100
/**
The file is in tskit format, but the version is too old for the
library to read. The file should be upgraded to the latest version
using the ``tskit upgrade`` command line utility from tskit version<0.6.2.
*/
#define TSK_ERR_FILE_VERSION_TOO_OLD -101
/**
The file is in tskit format, but the version is too new for the
library to read. To read the file you must upgrade the version
of tskit.
*/
#define TSK_ERR_FILE_VERSION_TOO_NEW -102

/**
A column that is a required member of a table was not found in
the file.
*/
#define TSK_ERR_REQUIRED_COL_NOT_FOUND -103

/**
One of a pair of columns that must be specified together was
not found in the file.
*/
#define TSK_ERR_BOTH_COLUMNS_REQUIRED -104

/**
An unsupported type was provided for a column in the file.
*/
#define TSK_ERR_BAD_COLUMN_TYPE -105
/** @} */

/**
@defgroup OOB_ERROR_GROUP Out of bounds errors.
@{
*/
/**
A bad value was provided for a ragged column offset, values should
start at zero and be monotonically increasing.
*/
#define TSK_ERR_BAD_OFFSET -200
/**
A position to seek to was less than zero or greater than the length
of the genome
*/
#define TSK_ERR_SEEK_OUT_OF_BOUNDS -201
/**
A node id was less than zero or greater than the final index
*/
#define TSK_ERR_NODE_OUT_OF_BOUNDS -202
/**
An edge id was less than zero or greater than the final index
*/
#define TSK_ERR_EDGE_OUT_OF_BOUNDS -203
/**
A population id was less than zero or greater than the final index
*/
#define TSK_ERR_POPULATION_OUT_OF_BOUNDS -204
/**
A site id was less than zero or greater than the final index
*/
#define TSK_ERR_SITE_OUT_OF_BOUNDS -205
/**
A mutation id was less than zero or greater than the final index
*/
#define TSK_ERR_MUTATION_OUT_OF_BOUNDS -206
/**
An individual id was less than zero or greater than the final index
*/
#define TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS -207
/**
A migration id was less than zero or greater than the final index
*/
#define TSK_ERR_MIGRATION_OUT_OF_BOUNDS -208
/**
A provenance id was less than zero or greater than the final index
*/
#define TSK_ERR_PROVENANCE_OUT_OF_BOUNDS -209
/**
A time value was non-finite (NaN counts as finite)
*/
#define TSK_ERR_TIME_NONFINITE -210
/**
A genomic position was non-finite
*/
#define TSK_ERR_GENOME_COORDS_NONFINITE -211
/**
One of the rows in the retained table refers to a row that has been
deleted.
*/
#define TSK_ERR_KEEP_ROWS_MAP_TO_DELETED -212
/**
A genomic position was less than zero or greater than or equal to the
sequence length
*/
#define TSK_ERR_POSITION_OUT_OF_BOUNDS -213

/** @} */

/**
@defgroup EDGE_ERROR_GROUP Edge errors.
@{
*/
/**
A parent node of an edge was TSK_NULL.
*/
#define TSK_ERR_NULL_PARENT -300
/**
A child node of an edge was TSK_NULL.
*/
#define TSK_ERR_NULL_CHILD -301
/**
The edge table was not sorted by the time of each edge's parent
nodes. Sort order is (time[parent], child, left).
*/
#define TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME -302
/**
A parent node had edges that were non-contiguous.
*/
#define TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS -303
/**
The edge table was not sorted by the id of the child node of each edge.
Sort order is (time[parent], child, left).
*/
#define TSK_ERR_EDGES_NOT_SORTED_CHILD -304
/**
The edge table was not sorted by the left coordinate of each edge.
Sort order is (time[parent], child, left).
*/
#define TSK_ERR_EDGES_NOT_SORTED_LEFT -305
/**
An edge had a child node that was older than the parent. Parent times must
be greater than the child time.
*/
#define TSK_ERR_BAD_NODE_TIME_ORDERING -306
/**
An edge had a genomic interval where right was less than or equal to left.
*/
#define TSK_ERR_BAD_EDGE_INTERVAL -307
/**
An edge was duplicated.
*/
#define TSK_ERR_DUPLICATE_EDGES -308
/**
An edge had a right coord greater than the genomic length.
*/
#define TSK_ERR_RIGHT_GREATER_SEQ_LENGTH -309
/**
An edge had a left coord less than zero.
*/
#define TSK_ERR_LEFT_LESS_ZERO -310
/**
A parent node had edges that were contradictory over an interval.
*/
#define TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN -311
/**
A method that doesn't support edge metadata was attempted on an edge
table containing metadata.
*/
#define TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA -312
/** @} */

/**
@defgroup SITE_ERROR_GROUP Site errors.
@{
*/
/**
The site table was not in order of increasing genomic position.
*/
#define TSK_ERR_UNSORTED_SITES -400
/**
The site table had more than one site at a single genomic position.
*/
#define TSK_ERR_DUPLICATE_SITE_POSITION -401
/**
A site had a position that was less than zero or greater than the sequence
length.
*/
#define TSK_ERR_BAD_SITE_POSITION -402
/** @} */

/**
@defgroup MUTATION_ERROR_GROUP Mutation errors.
@{
*/
/**
A mutation had a parent mutation that was at a different site.
*/
#define TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE -500
/**
A mutation had a parent mutation that was itself.
*/
#define TSK_ERR_MUTATION_PARENT_EQUAL -501
/**
A mutation had a parent mutation that had a greater id.
*/
#define TSK_ERR_MUTATION_PARENT_AFTER_CHILD -502
/**
Two or more mutation parent references formed a loop
*/
#define TSK_ERR_MUTATION_PARENT_INCONSISTENT -503
/**
The mutation table was not in the order of non-decreasing site id and
non-increasing time within each site.
*/
#define TSK_ERR_UNSORTED_MUTATIONS -504
/* 505 was the now unused TSK_ERR_NON_SINGLE_CHAR_MUTATION */
/**
A mutation's time was younger than (not >=) the time of its node
and wasn't TSK_UNKNOWN_TIME.
*/
#define TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE -506
/**
A mutation's time was older (not <=) than the time of its parent
mutation, and wasn't TSK_UNKNOWN_TIME.
*/
#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION -507
/**
A mutation's time was older (not <) than the time of the parent node of
the edge on which it occurs, and wasn't TSK_UNKNOWN_TIME.
*/
#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE -508
/**
A single site had a mixture of known mutation times and TSK_UNKNOWN_TIME
*/
#define TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN -509
/**
Some mutations have TSK_UNKNOWN_TIME in an algorithm where that's
disallowed (use compute_mutation_times?).
*/
#define TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME -510

/**
A mutation's parent was not consistent with the topology of the tree.
*/
#define TSK_ERR_BAD_MUTATION_PARENT -511

/** @} */

/**
@defgroup MIGRATION_ERROR_GROUP Migration errors.
@{
*/
/**
The migration table was not sorted by time.
*/
#define TSK_ERR_UNSORTED_MIGRATIONS -550
/** @} */

/**
@defgroup SAMPLE_ERROR_GROUP Sample errors.
@{
*/
/**
A duplicate sample was specified.
*/
#define TSK_ERR_DUPLICATE_SAMPLE -600
/**
A sample id that was not valid was specified.
*/
#define TSK_ERR_BAD_SAMPLES -601
/** @} */

/**
@defgroup TABLE_ERROR_GROUP Table errors.
@{
*/
/**
An invalid table position was specified.
*/
#define TSK_ERR_BAD_TABLE_POSITION -700
/**
A sequence length equal to or less than zero was specified.
*/
#define TSK_ERR_BAD_SEQUENCE_LENGTH -701
/**
The table collection was not indexed.
*/
#define TSK_ERR_TABLES_NOT_INDEXED -702
/**
Tables cannot be larger than 2**31 rows.
*/
#define TSK_ERR_TABLE_OVERFLOW -703
/**
Ragged array columns cannot be larger than 2**64 bytes.
*/
#define TSK_ERR_COLUMN_OVERFLOW -704
/**
The table collection contains more than 2**31 trees.
*/
#define TSK_ERR_TREE_OVERFLOW -705
/**
Metadata was attempted to be set on a table where it is disabled.
*/
#define TSK_ERR_METADATA_DISABLED -706
/**
There was an error with the table's indexes.
*/
#define TSK_ERR_TABLES_BAD_INDEXES -707
/** @} */

/**
@defgroup LIMITATION_ERROR_GROUP Limitation errors.
@{
*/
/**
An operation was attempted that only supports infinite sites, i.e.
at most a single mutation per site.
*/
#define TSK_ERR_ONLY_INFINITE_SITES -800
/**
Simplification was attempted with migrations present, which are not
supported.
*/
#define TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED -801
/**
Sorting was attempted on migrations, which is not supported.
*/
#define TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED -802
/**
An invalid sort offset was specified, for sites and mutations this must
be either 0 or the table length.
*/
#define TSK_ERR_SORT_OFFSET_NOT_SUPPORTED -803
/**
An operation was attempted that only supports binary mutations.
*/
#define TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED -804
/**
An operation was attempted that doesn't support migrations, with a
non-empty migration table.
*/
#define TSK_ERR_MIGRATIONS_NOT_SUPPORTED -805
/**
A table attempted to extend from itself.
*/
#define TSK_ERR_CANNOT_EXTEND_FROM_SELF -806
/**
An operation was attempted that doesn't support silent mutations, i.e.
a mutation that doesn't change the allelic state.
*/
#define TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED -807
/**
A copy of a variant cannot be decoded.
*/
#define TSK_ERR_VARIANT_CANT_DECODE_COPY -808
/**
A tree sequence cannot take ownership of a table collection where
TSK_NO_EDGE_METADATA is specified.
*/
#define TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA -809
/**
Operation is undefined for nonbinary trees.
*/
#define TSK_ERR_UNDEFINED_NONBINARY -810
/**
Operation is undefined for trees with multiple roots.
*/
#define TSK_ERR_UNDEFINED_MULTIROOT -811

/** @} */

/**
@defgroup STATS_ERROR_GROUP Stats errors.
@{
*/
/**
Zero windows were specified, at least one window must be specified.
*/
#define TSK_ERR_BAD_NUM_WINDOWS -900
/**
The window specification was not an increasing list of positions between
0 and the sequence length.
*/
#define TSK_ERR_BAD_WINDOWS -901
/**
More than one stat mode was specified.
*/
#define TSK_ERR_MULTIPLE_STAT_MODES -902
/**
The state dimension was not >=1.
*/
#define TSK_ERR_BAD_STATE_DIMS -903
/**
The result dimension was not >=1.
*/
#define TSK_ERR_BAD_RESULT_DIMS -904
/**
Insufficient sample sets were provided.
*/
#define TSK_ERR_INSUFFICIENT_SAMPLE_SETS -905
/**
Insufficient sample set index tuples were provided.
*/
#define TSK_ERR_INSUFFICIENT_INDEX_TUPLES -906
/**
The sample set index was out of bounds.
*/
#define TSK_ERR_BAD_SAMPLE_SET_INDEX -907
/**
The sample set index was empty.
*/
/* NOTE(review): the macro name suggests this is raised for an empty sample
   set rather than an empty index - confirm and reword the text above. */
#define TSK_ERR_EMPTY_SAMPLE_SET -908
/**
A stat mode was attempted that is not supported by the operation.
*/
#define TSK_ERR_UNSUPPORTED_STAT_MODE -909
/**
Statistics based on branch lengths were attempted when the ``time_units``
were ``uncalibrated``.
*/
#define TSK_ERR_TIME_UNCALIBRATED -910
/**
The TSK_STAT_POLARISED option was passed to a statistic that does
not support it.
*/
#define TSK_ERR_STAT_POLARISED_UNSUPPORTED -911
/**
The TSK_STAT_SPAN_NORMALISE option was passed to a statistic that does
not support it.
*/
#define TSK_ERR_STAT_SPAN_NORMALISE_UNSUPPORTED -912
/**
Insufficient weights were provided.
*/
#define TSK_ERR_INSUFFICIENT_WEIGHTS -913
/**
The node bin map contains a value less than TSK_NULL.
*/
#define TSK_ERR_BAD_NODE_BIN_MAP -914
/**
Maximum index in node bin map is greater than output dimension.
*/
#define TSK_ERR_BAD_NODE_BIN_MAP_DIM -915
/**
The vector of quantiles is out of bounds or in nonascending order.
*/
#define TSK_ERR_BAD_QUANTILES -916
/**
Times are not in ascending order
*/
#define TSK_ERR_UNSORTED_TIMES -917
/**
The provided positions are not provided in strictly increasing order
*/
#define TSK_ERR_STAT_UNSORTED_POSITIONS -918
/**
The provided positions are not unique
*/
#define TSK_ERR_STAT_DUPLICATE_POSITIONS -919
/**
The provided sites are not provided in strictly increasing position order
*/
#define TSK_ERR_STAT_UNSORTED_SITES -920
/**
The provided sites are not unique
*/
#define TSK_ERR_STAT_DUPLICATE_SITES -921
/**
The number of time windows is zero
*/
#define TSK_ERR_BAD_TIME_WINDOWS_DIM -922
/**
Sample times do not all equal the start of first time window
*/
#define TSK_ERR_BAD_SAMPLE_PAIR_TIMES -923
/**
Time windows are not strictly increasing
*/
#define TSK_ERR_BAD_TIME_WINDOWS -924
/**
Time windows do not end at infinity
*/
#define TSK_ERR_BAD_TIME_WINDOWS_END -925
/**
Node time does not fall within assigned time window
*/
#define TSK_ERR_BAD_NODE_TIME_WINDOW -926
/** @} */

/**
@defgroup MAPPING_ERROR_GROUP Mutation mapping errors.
@{
*/
/**
Only missing genotypes were specified, at least one non-missing is
required.
*/
#define TSK_ERR_GENOTYPES_ALL_MISSING -1000
/**
A genotype value was greater than the maximum allowed (64) or less
than TSK_MISSING_DATA (-1).
*/
#define TSK_ERR_BAD_GENOTYPE -1001
/**
An ancestral genotype value was greater than the maximum allowed (64) or less
than 0.
*/
#define TSK_ERR_BAD_ANCESTRAL_STATE -1002
/** @} */

/**
@defgroup GENOTYPE_ERROR_GROUP Genotype decoding errors.
@{
*/
/**
Genotypes were requested for non-samples at the same time
as asking that isolated nodes be marked as missing. This is not
supported.
*/
#define TSK_ERR_MUST_IMPUTE_NON_SAMPLES -1100
/**
A user-specified allele map was used, but didn't contain an allele
found in the tree sequence.
*/
#define TSK_ERR_ALLELE_NOT_FOUND -1101
/**
More than 2147483647 alleles were specified.
*/
#define TSK_ERR_TOO_MANY_ALLELES -1102
/**
A user-specified allele map was used, but it contained zero alleles.
*/
#define TSK_ERR_ZERO_ALLELES -1103
/**
An allele used when decoding alignments had length other than one.
*/
#define TSK_ERR_BAD_ALLELE_LENGTH -1104
/**
An allele used when decoding alignments matched the missing data character.
*/
#define TSK_ERR_MISSING_CHAR_COLLISION -1105
/** @} */

/**
@defgroup DISTANCE_ERROR_GROUP Distance metric errors.
@{
*/
/**
Trees with different numbers of samples were specified.
*/
#define TSK_ERR_SAMPLE_SIZE_MISMATCH -1200
/**
Trees with nonidentical samples were specified.
*/
#define TSK_ERR_SAMPLES_NOT_EQUAL -1201
/**
A tree with multiple roots was specified.
*/
#define TSK_ERR_MULTIPLE_ROOTS -1202
/**
A tree with unary nodes was specified.
*/
#define TSK_ERR_UNARY_NODES -1203
/**
Trees were specified that had unequal sequence lengths.
*/
#define TSK_ERR_SEQUENCE_LENGTH_MISMATCH -1204
/**
A tree was specified that did not have the sample lists option
enabled (TSK_SAMPLE_LISTS).
*/
#define TSK_ERR_NO_SAMPLE_LISTS -1205
/** @} */

/**
@defgroup HAPLOTYPE_ERROR_GROUP Haplotype matching errors.
@{
*/
/**
The Viterbi matrix has not filled (it has zero transitions).
*/
#define TSK_ERR_NULL_VITERBI_MATRIX -1300
/**
There was no matching haplotype.
*/
#define TSK_ERR_MATCH_IMPOSSIBLE -1301
/**
The compressed matrix has a node that has no samples in its descendants.
*/
#define TSK_ERR_BAD_COMPRESSED_MATRIX_NODE -1302
/**
There are too many values to compress.
*/
#define TSK_ERR_TOO_MANY_VALUES -1303
/** @} */

/**
@defgroup UNION_ERROR_GROUP Union errors.
@{
*/
/**
A node map was specified that contained a node not present in the
specified table collection.
*/
#define TSK_ERR_UNION_BAD_MAP -1400
/**
The shared portions of the specified tree sequences are not equal.
Note that this may be the case if the table collections were not
fully sorted before union was called.
*/
#define TSK_ERR_UNION_DIFF_HISTORIES -1401
/** @} */

/**
@defgroup IBD_ERROR_GROUP IBD errors.
@{
*/
/**
Both nodes in a sample pair are the same node.
*/
#define TSK_ERR_SAME_NODES_IN_PAIR -1500
/**
Per-pair statistics were requested without TSK_IBD_STORE_PAIRS being
specified.
*/
#define TSK_ERR_IBD_PAIRS_NOT_STORED -1501
/**
Segments were requested without TSK_IBD_STORE_SEGMENTS being specified.
*/
#define TSK_ERR_IBD_SEGMENTS_NOT_STORED -1502
/** @} */

/**
@defgroup SIMPLIFY_ERROR_GROUP Simplify errors.
@{
*/
/**
Both TSK_SIMPLIFY_KEEP_UNARY and TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS
were specified. Only one can be used.
*/
#define TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE -1600
/** @} */

/**
@defgroup INDIVIDUAL_ERROR_GROUP Individual errors.
@{
*/
/**
Individuals were provided in an order where parents were after their
children.
*/
#define TSK_ERR_UNSORTED_INDIVIDUALS -1700
/**
An individual was its own parent.
*/
#define TSK_ERR_INDIVIDUAL_SELF_PARENT -1701
/**
An individual was its own ancestor in a cycle of references.
*/
#define TSK_ERR_INDIVIDUAL_PARENT_CYCLE -1702
/**
An individual had nodes from more than one population
(and only one was requested).
*/
#define TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH -1703
/**
An individual had nodes from more than one time
(and only one was requested).
*/
#define TSK_ERR_INDIVIDUAL_TIME_MISMATCH -1704
/** @} */

/**
@defgroup EXTEND_EDGES_ERROR_GROUP Extend edges errors.
@{
*/
/**
Maximum iteration number (max_iter) must be positive.
*/
#define TSK_ERR_EXTEND_EDGES_BAD_MAXITER -1800
/** @} */

// clang-format on

/* This bit is 0 for any errors originating from kastore */
#define TSK_KAS_ERR_BIT 14

/* NOTE(review): these three presumably convert a kastore error code to and
 * from the tskit error space using TSK_KAS_ERR_BIT - the implementations are
 * in core.c; confirm there before relying on the exact encoding. */
int tsk_set_kas_error(int err);
bool tsk_is_kas_error(int err);
int tsk_get_kas_error(int err);

/**
@brief Return a description of the specified error.

The memory for the returned string is handled by the library and should
not be freed by client code.

@param err A tskit error code.
@return A description of the error.
*/
const char *tsk_strerror(int err);

/* Redefine this macro in downstream builds if stdout is not the
 * appropriate stream to emit debug information when the TSK_DEBUG
 * flag is passed to supporting functions (e.g. in R).
 */
#define TSK_DEFAULT_DEBUG_STREAM stdout

#ifdef TSK_TRACE_ERRORS

/* Report the error code, its description and the source location on stderr,
 * then return the code unchanged so the call can wrap an error expression. */
static inline int
_tsk_trace_error(int err, int line, const char *file)
{
    fprintf(stderr, "tskit-trace-error: %d='%s' at line %d in %s\n", err,
        tsk_strerror(err), line, file);
    return err;
}

/*
Developer note: this macro may be redefined as part of compilation for
an R package, and should be treated as part of the documented API
(no changes).
*/
#define tsk_trace_error(err) (_tsk_trace_error(err, __LINE__, __FILE__))
#else
/* Without TSK_TRACE_ERRORS the macro is a no-op that yields the code. */
#define tsk_trace_error(err) (err)
#endif

#ifndef TSK_BUG_ASSERT_MESSAGE
#define TSK_BUG_ASSERT_MESSAGE \
    "If you are using tskit directly please open an issue on" \
    " GitHub, ideally with a reproducible example." \
    " (https://github.com/tskit-dev/tskit/issues) If you are" \
    " using software that uses tskit, please report an issue" \
    " to that software's issue tracker, at least initially."
#endif

/**
We often wish to assert a condition that is unexpected, but using the normal `assert`
means compiling without NDEBUG. This macro still asserts when NDEBUG is defined.
If you are using this macro in your own software then please set TSK_BUG_ASSERT_MESSAGE
to point users to your issue tracker.
*/
/*
Developer note: this macro may be redefined as part of compilation for
an R package, and should be treated as part of the documented API
(no changes).
*/
#define tsk_bug_assert(condition) \
    do { \
        if (!(condition)) { \
            fprintf(stderr, "Bug detected in %s at line %d. %s\n", __FILE__, __LINE__, \
                TSK_BUG_ASSERT_MESSAGE); \
            abort(); \
        } \
    } while (0)

void __tsk_safe_free(void **ptr);
/* Takes the address of the pointer variable itself, so the argument must be
 * an lvalue. */
#define tsk_safe_free(pointer) __tsk_safe_free((void **) &(pointer))

/* NB: function-like macros; each argument may be evaluated twice, so do not
 * pass expressions with side effects. */
#define TSK_MAX(a, b) ((a) > (b) ? (a) : (b))
#define TSK_MIN(a, b) ((a) < (b) ? (a) : (b))

/* This is a simple allocator that is optimised to efficiently allocate a
 * large number of small objects without large numbers of calls to malloc.
 * The allocator mallocs memory in chunks of a configurable size. When
 * responding to calls to get(), it will return a chunk of this memory.
 * This memory cannot be subsequently handed back to the allocator. However,
 * all memory allocated by the allocator can be returned at once by calling
 * reset.
 */
typedef struct {
    size_t chunk_size; /* number of bytes per chunk */
    size_t top; /* the offset of the next available byte in the current chunk */
    size_t current_chunk; /* the index of the chunk currently being used */
    size_t total_size; /* the total number of bytes allocated + overhead. */
    size_t total_allocated; /* the total number of bytes allocated. */
    size_t num_chunks; /* the number of memory chunks. */
    char **mem_chunks; /* the memory chunks */
} tsk_blkalloc_t;

extern void tsk_blkalloc_print_state(tsk_blkalloc_t *self, FILE *out);
extern int tsk_blkalloc_reset(tsk_blkalloc_t *self);
extern int tsk_blkalloc_init(tsk_blkalloc_t *self, size_t chunk_size);
extern void *tsk_blkalloc_get(tsk_blkalloc_t *self, size_t size);
extern void tsk_blkalloc_free(tsk_blkalloc_t *self);

/* A node of the integer-keyed AVL tree. */
typedef struct _tsk_avl_node_int_t {
    int64_t key;
    void *value;
    struct _tsk_avl_node_int_t *llink;
    struct _tsk_avl_node_int_t *rlink;
    /* This can only contain -1, 0, 1. We could set it to a smaller type,
     * but there's no point because of struct padding and alignment so
     * it's simplest to keep it as a plain int.
     */
    int balance;
} tsk_avl_node_int_t;

/* An AVL tree keyed on int64_t values. head is a header node whose rlink is
 * the root of the tree. */
typedef struct {
    tsk_avl_node_int_t head;
    tsk_size_t size;
    tsk_size_t height;
} tsk_avl_tree_int_t;

int tsk_avl_tree_int_init(tsk_avl_tree_int_t *self);
int tsk_avl_tree_int_free(tsk_avl_tree_int_t *self);
void tsk_avl_tree_int_print_state(tsk_avl_tree_int_t *self, FILE *out);
int tsk_avl_tree_int_insert(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node);
tsk_avl_node_int_t *tsk_avl_tree_int_search(const tsk_avl_tree_int_t *self, int64_t key);
int tsk_avl_tree_int_ordered_nodes(
    const tsk_avl_tree_int_t *self, tsk_avl_node_int_t **out);
tsk_avl_node_int_t *tsk_avl_tree_int_get_root(const tsk_avl_tree_int_t *self);

tsk_size_t tsk_search_sorted(const double *array, tsk_size_t size, double value);
double tsk_round(double x, unsigned int ndigits);

/**
@brief Check if a number is ``TSK_UNKNOWN_TIME``

@rst
Unknown time values in tskit are represented by a particular NaN value. Since NaN values
are not equal to each other by definition, a simple comparison like
``mutation.time == TSK_UNKNOWN_TIME`` will fail even if the mutation's time is
TSK_UNKNOWN_TIME. This function compares the underlying bit representation of a double
value and returns true iff it is equal to the specific NaN value
:c:macro:`TSK_UNKNOWN_TIME`.
@endrst

@param val The number to check
@return true if the number is ``TSK_UNKNOWN_TIME`` else false
*/
bool tsk_is_unknown_time(double val);

/* We define local versions of isnan and isfinite to workaround some portability
 * issues. */
bool tsk_isnan(double val);
bool tsk_isfinite(double val);

#define TSK_UUID_SIZE 36
int tsk_generate_uuid(char *dest, int flags);

/* TODO most of these can probably be macros so they compile out as no-ops.
 * Let's do the 64 bit tsk_size_t switch first though.
*/ void *tsk_malloc(tsk_size_t size); void *tsk_realloc(void *ptr, tsk_size_t size); void *tsk_calloc(tsk_size_t n, size_t size); void *tsk_memset(void *ptr, int fill, tsk_size_t size); void *tsk_memcpy(void *dest, const void *src, tsk_size_t size); void *tsk_memmove(void *dest, const void *src, tsk_size_t size); int tsk_memcmp(const void *s1, const void *s2, tsk_size_t size); /* Developer debug utilities. These are **not** threadsafe */ void tsk_set_debug_stream(FILE *f); FILE *tsk_get_debug_stream(void); /* Bit Array functionality */ // define a 32-bit chunk size for our bitsets. // this means we'll be able to hold 32 distinct items in each 32 bit uint #define TSK_BITSET_BITS ((tsk_size_t) 32) typedef uint32_t tsk_bitset_val_t; typedef struct { tsk_size_t row_len; // Number of size TSK_BITSET_BITS chunks per row tsk_size_t len; // Number of rows tsk_bitset_val_t *data; } tsk_bitset_t; int tsk_bitset_init(tsk_bitset_t *self, tsk_size_t num_bits, tsk_size_t length); void tsk_bitset_free(tsk_bitset_t *self); void tsk_bitset_intersect(const tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, tsk_size_t other_row, tsk_bitset_t *out); void tsk_bitset_subtract(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, tsk_size_t other_row); void tsk_bitset_union(tsk_bitset_t *self, tsk_size_t self_row, const tsk_bitset_t *other, tsk_size_t other_row); void tsk_bitset_set_bit(tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit); bool tsk_bitset_contains( const tsk_bitset_t *self, tsk_size_t row, const tsk_bitset_val_t bit); tsk_size_t tsk_bitset_count(const tsk_bitset_t *self, tsk_size_t row); void tsk_bitset_get_items( const tsk_bitset_t *self, tsk_size_t row, tsk_id_t *items, tsk_size_t *n_items); #ifdef __cplusplus } #endif #endif ================================================ FILE: c/tskit/genotypes.c ================================================ /* * MIT License * * Copyright (c) 2019-2025 Tskit Developers * Copyright (c) 
2016-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include
/* ======================================================== *
 * Variant generator
 * ======================================================== */

/* Write a human readable summary of the variant to the specified stream:
 * whether the alleles were supplied by the user, every allele currently
 * stored (with its length) and the number of samples. */
void
tsk_variant_print_state(const tsk_variant_t *self, FILE *out)
{
    tsk_size_t allele = 0;

    fprintf(out, "tsk_variant state\n");
    fprintf(out, "user_alleles = %lld\n", (long long) self->user_alleles);
    fprintf(out, "num_alleles = %lld\n", (long long) self->num_alleles);
    while (allele < self->num_alleles) {
        /* Alleles are not NUL terminated, so the precision argument bounds
         * the number of characters printed. */
        fprintf(out, "\tlen = %lld, '%.*s'\n",
            (long long) self->allele_lengths[allele],
            (int) self->allele_lengths[allele], self->alleles[allele]);
        allele++;
    }
    fprintf(out, "num_samples = %lld\n", (long long) self->num_samples);
}

/* Write the state of the variant embedded in this vargen to the stream. */
void
tsk_vargen_print_state(const tsk_vargen_t *self, FILE *out)
{
    const tsk_variant_t *embedded = &self->variant;

    tsk_variant_print_state(embedded, out);
}

/* Copy the fixed allele mapping specified by the user into local
 * memory.
*/ static int tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles) { int ret = 0; tsk_size_t j; size_t total_len, allele_len, offset; self->num_alleles = self->max_alleles; total_len = 0; for (j = 0; j < self->num_alleles; j++) { allele_len = strlen(alleles[j]); self->allele_lengths[j] = (tsk_size_t) allele_len; total_len += allele_len; } self->user_alleles_mem = tsk_malloc(total_len * sizeof(char *)); if (self->user_alleles_mem == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } offset = 0; for (j = 0; j < self->num_alleles; j++) { strcpy(self->user_alleles_mem + offset, alleles[j]); self->alleles[j] = self->user_alleles_mem + offset; offset += (size_t) self->allele_lengths[j]; } out: return ret; } static int variant_init_samples_and_index_map(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_size_t j, num_nodes; tsk_id_t u; num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); self->alt_samples = tsk_malloc(num_samples_alloc * sizeof(*samples)); self->alt_sample_index_map = tsk_malloc(num_nodes * sizeof(*self->alt_sample_index_map)); if (self->alt_samples == NULL || self->alt_sample_index_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy(self->alt_samples, samples, num_samples * sizeof(*samples)); tsk_memset(self->alt_sample_index_map, 0xff, num_nodes * sizeof(*self->alt_sample_index_map)); /* Create the reverse mapping */ for (j = 0; j < num_samples; j++) { u = samples[j]; if (u < 0 || u >= (tsk_id_t) num_nodes) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } if (self->alt_sample_index_map[u] != TSK_NULL) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } self->alt_sample_index_map[samples[j]] = (tsk_id_t) j; } out: return ret; } int tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t 
num_samples, const char **alleles, tsk_flags_t options) { int ret = 0; tsk_size_t max_alleles_limit, max_alleles; tsk_size_t num_samples_alloc; tsk_memset(self, 0, sizeof(tsk_variant_t)); /* Set site id to NULL to indicate the variant is not decoded */ self->site.id = TSK_NULL; self->tree_sequence = tree_sequence; ret = tsk_tree_init( &self->tree, tree_sequence, samples == NULL ? TSK_SAMPLE_LISTS : 0); if (ret != 0) { goto out; } if (samples != NULL) { /* Take a copy of the samples so we don't have to manage the lifecycle*/ self->samples = tsk_malloc(num_samples * sizeof(*samples)); if (self->samples == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy(self->samples, samples, num_samples * sizeof(*samples)); self->num_samples = num_samples; } self->options = options; max_alleles_limit = INT32_MAX; if (alleles == NULL) { self->user_alleles = false; max_alleles = 4; /* Arbitrary --- we'll rarely have more than this */ } else { self->user_alleles = true; /* Count the input alleles. The end is designated by the NULL sentinel. 
*/ for (max_alleles = 0; alleles[max_alleles] != NULL; max_alleles++) ; if (max_alleles > max_alleles_limit) { ret = tsk_trace_error(TSK_ERR_TOO_MANY_ALLELES); goto out; } if (max_alleles == 0) { ret = tsk_trace_error(TSK_ERR_ZERO_ALLELES); goto out; } } self->max_alleles = max_alleles; self->alleles = tsk_calloc(max_alleles, sizeof(*self->alleles)); self->allele_lengths = tsk_malloc(max_alleles * sizeof(*self->allele_lengths)); if (self->alleles == NULL || self->allele_lengths == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (self->user_alleles) { ret = tsk_variant_copy_alleles(self, alleles); if (ret != 0) { goto out; } } if (self->samples == NULL) { self->num_samples = tsk_treeseq_get_num_samples(tree_sequence); self->samples = tsk_malloc(self->num_samples * sizeof(*self->samples)); if (self->samples == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy(self->samples, tsk_treeseq_get_samples(tree_sequence), self->num_samples * sizeof(*self->samples)); self->sample_index_map = tsk_treeseq_get_sample_index_map(tree_sequence); num_samples_alloc = self->num_samples; } else { num_samples_alloc = self->num_samples; ret = variant_init_samples_and_index_map(self, tree_sequence, self->samples, self->num_samples, (size_t) num_samples_alloc, self->options); if (ret != 0) { goto out; } self->sample_index_map = self->alt_sample_index_map; } /* When a list of samples is given, we use the traversal based algorithm * which doesn't use sample list tracking in the tree */ if (self->alt_samples != NULL) { self->traversal_stack = tsk_malloc( tsk_treeseq_get_num_nodes(tree_sequence) * sizeof(*self->traversal_stack)); if (self->traversal_stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } } self->genotypes = tsk_malloc(num_samples_alloc * sizeof(*self->genotypes)); if (self->genotypes == NULL || self->alleles == NULL || self->allele_lengths == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } out: return ret; } 
int tsk_vargen_init(tsk_vargen_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, tsk_flags_t options) { int ret = 0; tsk_bug_assert(tree_sequence != NULL); tsk_memset(self, 0, sizeof(tsk_vargen_t)); self->tree_sequence = tree_sequence; ret = tsk_variant_init( &self->variant, tree_sequence, samples, num_samples, alleles, options); if (ret != 0) { goto out; } ret = 0; out: return ret; } int tsk_variant_free(tsk_variant_t *self) { if (self->tree_sequence != NULL) { tsk_tree_free(&self->tree); } tsk_safe_free(self->genotypes); tsk_safe_free(self->alleles); tsk_safe_free(self->allele_lengths); tsk_safe_free(self->user_alleles_mem); tsk_safe_free(self->samples); tsk_safe_free(self->alt_samples); tsk_safe_free(self->alt_sample_index_map); tsk_safe_free(self->traversal_stack); return 0; } int tsk_vargen_free(tsk_vargen_t *self) { tsk_variant_free(&self->variant); return 0; } static int tsk_variant_expand_alleles(tsk_variant_t *self) { int ret = 0; void *p; tsk_size_t hard_limit = INT32_MAX; if (self->max_alleles == hard_limit) { ret = tsk_trace_error(TSK_ERR_TOO_MANY_ALLELES); goto out; } self->max_alleles = TSK_MIN(hard_limit, self->max_alleles * 2); p = tsk_realloc(self->alleles, self->max_alleles * sizeof(*self->alleles)); if (p == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->alleles = p; p = tsk_realloc( self->allele_lengths, self->max_alleles * sizeof(*self->allele_lengths)); if (p == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->allele_lengths = p; out: return ret; } /* The following pair of functions are identical except one handles 8 bit * genotypes and the other handles 16 bit genotypes. This is done for performance * reasons as this is a key function and for common alleles can entail * iterating over millions of samples. The compiler hints are included for the * same reason. 
*/ static int TSK_WARN_UNUSED tsk_variant_update_genotypes_sample_list( tsk_variant_t *self, tsk_id_t node, tsk_id_t derived) { int32_t *restrict genotypes = self->genotypes; const tsk_id_t *restrict list_left = self->tree.left_sample; const tsk_id_t *restrict list_right = self->tree.right_sample; const tsk_id_t *restrict list_next = self->tree.next_sample; tsk_id_t index, stop; int ret = 0; tsk_bug_assert(derived < INT32_MAX); index = list_left[node]; if (index != TSK_NULL) { stop = list_right[node]; while (true) { ret += genotypes[index] == TSK_MISSING_DATA; genotypes[index] = (int32_t) derived; if (index == stop) { break; } index = list_next[index]; } } return ret; } /* The following functions implement the genotype setting by traversing * down the tree to the samples. We're not so worried about performance here * because this should only be used when we have a very small number of samples, * and so we use a visit function to avoid duplicating code. */ typedef int (*visit_func_t)(tsk_variant_t *, tsk_id_t, tsk_id_t); static int TSK_WARN_UNUSED tsk_variant_traverse( tsk_variant_t *self, tsk_id_t node, tsk_id_t derived, visit_func_t visit) { int ret = 0; tsk_id_t *restrict stack = self->traversal_stack; const tsk_id_t *restrict left_child = self->tree.left_child; const tsk_id_t *restrict right_sib = self->tree.right_sib; const tsk_id_t *restrict sample_index_map = self->sample_index_map; tsk_id_t u, v, sample_index; int stack_top; int no_longer_missing = 0; stack_top = 0; stack[0] = node; while (stack_top >= 0) { u = stack[stack_top]; sample_index = sample_index_map[u]; if (sample_index != TSK_NULL) { ret = visit(self, sample_index, derived); if (ret < 0) { goto out; } no_longer_missing += ret; } stack_top--; for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { stack_top++; stack[stack_top] = v; } } ret = no_longer_missing; out: return ret; } static int tsk_variant_visit(tsk_variant_t *self, tsk_id_t sample_index, tsk_id_t derived) { int ret = 0; int32_t 
*restrict genotypes = self->genotypes; tsk_bug_assert(derived < INT32_MAX); tsk_bug_assert(sample_index != -1); ret = genotypes[sample_index] == TSK_MISSING_DATA; genotypes[sample_index] = (int32_t) derived; return ret; } static int TSK_WARN_UNUSED tsk_variant_update_genotypes_traversal( tsk_variant_t *self, tsk_id_t node, tsk_id_t derived) { return tsk_variant_traverse(self, node, derived, tsk_variant_visit); } static tsk_size_t tsk_variant_mark_missing(tsk_variant_t *self) { tsk_size_t num_missing = 0; const tsk_id_t *restrict left_child = self->tree.left_child; const tsk_id_t *restrict right_sib = self->tree.right_sib; const tsk_id_t *restrict sample_index_map = self->sample_index_map; const tsk_id_t N = self->tree.virtual_root; int32_t *restrict genotypes = self->genotypes; tsk_id_t root, sample_index; for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { if (left_child[root] == TSK_NULL) { sample_index = sample_index_map[root]; if (sample_index != TSK_NULL) { genotypes[sample_index] = TSK_MISSING_DATA; num_missing++; } } } return num_missing; } /* Mark missing for any requested node (sample or non-sample) that is isolated * in the current tree, i.e., has no parent and no children at this position. 
*/ static tsk_size_t tsk_variant_mark_missing_any(tsk_variant_t *self) { tsk_size_t num_missing = 0; int32_t *restrict genotypes = self->genotypes; const tsk_id_t *restrict parent = self->tree.parent; const tsk_id_t *restrict left_child = self->tree.left_child; tsk_size_t j; for (j = 0; j < self->num_samples; j++) { tsk_id_t u = self->samples[j]; if (parent[u] == TSK_NULL && left_child[u] == TSK_NULL) { genotypes[j] = TSK_MISSING_DATA; num_missing++; } } return num_missing; } static tsk_id_t tsk_variant_get_allele_index(tsk_variant_t *self, const char *allele, tsk_size_t length) { tsk_id_t ret = -1; tsk_size_t j; for (j = 0; j < self->num_alleles; j++) { if (length == self->allele_lengths[j] && tsk_memcmp(allele, self->alleles[j], length) == 0) { ret = (tsk_id_t) j; break; } } return ret; } int tsk_variant_decode( tsk_variant_t *self, tsk_id_t site_id, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t allele_index; tsk_size_t j, num_missing; int no_longer_missing; tsk_mutation_t mutation; bool impute_missing = !!(self->options & TSK_ISOLATED_NOT_MISSING); bool by_traversal = self->alt_samples != NULL; int (*update_genotypes)(tsk_variant_t *, tsk_id_t, tsk_id_t); tsk_size_t (*mark_missing)(tsk_variant_t *); if (self->tree_sequence == NULL) { ret = tsk_trace_error(TSK_ERR_VARIANT_CANT_DECODE_COPY); goto out; } ret = tsk_treeseq_get_site(self->tree_sequence, site_id, &self->site); if (ret != 0) { goto out; } ret = tsk_tree_seek(&self->tree, self->site.position, 0); if (ret != 0) { goto out; } /* When we have no specified samples we need sample lists to be active * on the tree, as indicated by the presence of left_sample */ if (!by_traversal && self->tree.left_sample == NULL) { ret = tsk_trace_error(TSK_ERR_NO_SAMPLE_LISTS); goto out; } /* For now we use a traversal method to find genotypes when we have a * specified set of samples, but we should provide the option to do it * via tracked_samples in the tree also. 
There will be a tradeoff: if * we only have a small number of samples, it's probably better to * do it by traversal. For large sets of samples though, it may be * better to use the sample list infrastructure. */ mark_missing = tsk_variant_mark_missing; update_genotypes = tsk_variant_update_genotypes_sample_list; if (by_traversal) { update_genotypes = tsk_variant_update_genotypes_traversal; /* When decoding a user-provided list of nodes (which may include * non-samples), mark isolated nodes as missing directly by checking * isolation status for each requested node. */ mark_missing = tsk_variant_mark_missing_any; } if (self->user_alleles) { allele_index = tsk_variant_get_allele_index( self, self->site.ancestral_state, self->site.ancestral_state_length); if (allele_index == -1) { ret = tsk_trace_error(TSK_ERR_ALLELE_NOT_FOUND); goto out; } } else { /* Ancestral state is always allele 0 */ self->alleles[0] = self->site.ancestral_state; self->allele_lengths[0] = self->site.ancestral_state_length; self->num_alleles = 1; allele_index = 0; } /* The algorithm for generating the allelic state of every sample works by * examining each mutation in order, and setting the state for all the * samples under the mutation's node. For complex sites where there is * more than one mutation, we depend on the ordering of mutations being * correct. Specifically, any mutation that is above another mutation in * the tree must be visited first. This is enforced using the mutation.parent * field, where we require that a mutation's parent must appear before it * in the list of mutations. This guarantees the correctness of this algorithm. 
*/ for (j = 0; j < self->num_samples; j++) { self->genotypes[j] = (int32_t) allele_index; } /* We mark missing data *before* updating the genotypes because * mutations directly over samples should not be missing */ num_missing = 0; if (!impute_missing) { num_missing = mark_missing(self); } for (j = 0; j < self->site.mutations_length; j++) { mutation = self->site.mutations[j]; /* Compute the allele index for this derived state value. */ allele_index = tsk_variant_get_allele_index( self, mutation.derived_state, mutation.derived_state_length); if (allele_index == -1) { if (self->user_alleles) { ret = tsk_trace_error(TSK_ERR_ALLELE_NOT_FOUND); goto out; } if (self->num_alleles == self->max_alleles) { ret = tsk_variant_expand_alleles(self); if (ret != 0) { goto out; } } allele_index = (tsk_id_t) self->num_alleles; self->alleles[allele_index] = mutation.derived_state; self->allele_lengths[allele_index] = mutation.derived_state_length; self->num_alleles++; } no_longer_missing = update_genotypes(self, mutation.node, allele_index); if (no_longer_missing < 0) { ret = no_longer_missing; goto out; } /* Update genotypes returns the number of missing values marked * not-missing */ num_missing -= (tsk_size_t) no_longer_missing; } self->has_missing_data = num_missing > 0; out: return ret; } int tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) { int ret = 0; tsk_size_t total_len, offset, j; /* Copy everything */ tsk_memcpy(other, self, sizeof(*other)); /* Tree sequence left as NULL and zero'd tree is a way of indicating this variant is * fixed and cannot be further decoded. 
*/ other->tree_sequence = NULL; tsk_memset(&other->tree, sizeof(other->tree), 0); other->traversal_stack = NULL; other->samples = NULL; other->sample_index_map = NULL; other->alt_samples = NULL; other->alt_sample_index_map = NULL; other->user_alleles_mem = NULL; total_len = 0; for (j = 0; j < self->num_alleles; j++) { total_len += self->allele_lengths[j]; } other->samples = tsk_malloc(other->num_samples * sizeof(*other->samples)); other->genotypes = tsk_malloc(other->num_samples * sizeof(*other->genotypes)); other->user_alleles_mem = tsk_malloc(total_len * sizeof(*other->user_alleles_mem)); other->allele_lengths = tsk_malloc(other->num_alleles * sizeof(*other->allele_lengths)); other->alleles = tsk_malloc(other->num_alleles * sizeof(*other->alleles)); if (other->samples == NULL || other->genotypes == NULL || other->user_alleles_mem == NULL || other->allele_lengths == NULL || other->alleles == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy( other->samples, self->samples, other->num_samples * sizeof(*other->samples)); tsk_memcpy(other->genotypes, self->genotypes, other->num_samples * sizeof(*other->genotypes)); tsk_memcpy(other->allele_lengths, self->allele_lengths, other->num_alleles * sizeof(*other->allele_lengths)); offset = 0; for (j = 0; j < other->num_alleles; j++) { tsk_memcpy(other->user_alleles_mem + offset, self->alleles[j], other->allele_lengths[j] * sizeof(*other->user_alleles_mem)); other->alleles[j] = other->user_alleles_mem + offset; offset += other->allele_lengths[j]; } out: return ret; } int tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant) { int ret = 0; if ((tsk_size_t) self->site_index < tsk_treeseq_get_num_sites(self->tree_sequence)) { ret = tsk_variant_decode(&self->variant, self->site_index, 0); if (ret != 0) { goto out; } self->site_index++; *variant = &self->variant; ret = 1; } out: return ret; } static int tsk_treeseq_decode_alignments_overlay_missing(const tsk_treeseq_t *self, const tsk_id_t *nodes, 
tsk_size_t num_nodes, double left, double right, char missing_data_character, tsk_size_t L, char *alignments_out) { int ret = 0; tsk_tree_t tree; tsk_size_t i, seg_left, seg_right; char *row = NULL; tsk_id_t u; tsk_memset(&tree, 0, sizeof(tree)); ret = tsk_tree_init(&tree, self, 0); if (ret != 0) { goto out; } ret = tsk_tree_seek(&tree, left, 0); if (ret != 0) { goto out; } while (tree.index != -1 && tree.interval.left < right) { seg_left = TSK_MAX((tsk_size_t) tree.interval.left, (tsk_size_t) left); seg_right = TSK_MIN((tsk_size_t) tree.interval.right, (tsk_size_t) right); if (seg_right > seg_left) { for (i = 0; i < num_nodes; i++) { u = nodes[i]; if (tree.parent[u] == TSK_NULL && tree.left_child[u] == TSK_NULL) { row = alignments_out + i * L; /* memset takes an `int`, `missing_data_character` is a `char` which * can be signed or unsigned depending on the platform, so we need to * cast. Some tools/compilers will warn if we just cast * to `unsigned char` and leave the cast to `int` as implicit, hence * the double cast. 
*/ tsk_memset(row + (seg_left - (tsk_size_t) left), (int) (unsigned char) missing_data_character, seg_right - seg_left); } } } ret = tsk_tree_next(&tree); if (ret < 0) { goto out; } } /* On success we should return 0, not TSK_TREE_OK from the last tsk_tree_next */ ret = 0; out: tsk_tree_free(&tree); return ret; } static int tsk_treeseq_decode_alignments_overlay_sites(const tsk_treeseq_t *self, const tsk_id_t *nodes, tsk_size_t num_nodes, double left, double right, char missing_data_character, tsk_size_t L, char *alignments_out, tsk_flags_t options) { int ret = 0; tsk_variant_t var; tsk_id_t site_id; tsk_site_t site; char *allele_byte = NULL; tsk_size_t allele_cap = 0; tsk_size_t i, j; char *row = NULL; int32_t g; char c; char *tmp = NULL; tsk_memset(&var, 0, sizeof(var)); ret = tsk_variant_init(&var, self, nodes, num_nodes, NULL, options); if (ret != 0) { goto out; } for (site_id = 0; site_id < (tsk_id_t) tsk_treeseq_get_num_sites(self); site_id++) { ret = tsk_treeseq_get_site(self, site_id, &site); if (ret != 0) { goto out; } if (site.position < left) { continue; } if (site.position >= right) { break; } ret = tsk_variant_decode(&var, site_id, 0); if (ret != 0) { goto out; } if (var.num_alleles > 0) { if (var.num_alleles > allele_cap) { tmp = tsk_realloc(allele_byte, var.num_alleles * sizeof(*allele_byte)); if (tmp == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } allele_byte = tmp; allele_cap = var.num_alleles; } for (j = 0; j < var.num_alleles; j++) { if (var.allele_lengths[j] != 1) { ret = tsk_trace_error(TSK_ERR_BAD_ALLELE_LENGTH); goto out; } allele_byte[j] = var.alleles[j][0]; if (allele_byte[j] == missing_data_character) { ret = tsk_trace_error(TSK_ERR_MISSING_CHAR_COLLISION); goto out; } } for (i = 0; i < num_nodes; i++) { row = alignments_out + i * L; g = var.genotypes[i]; c = missing_data_character; if (g != TSK_MISSING_DATA) { tsk_bug_assert(g >= 0); tsk_bug_assert((tsk_size_t) g < var.num_alleles); c = allele_byte[g]; } row[((tsk_size_t) 
site.position) - (tsk_size_t) left] = (char) c; } } } out: tsk_safe_free(allele_byte); tsk_variant_free(&var); return ret; } /* NOTE: We usually keep functions with a tsk_treeseq_t signature in trees.c. * tsk_treeseq_decode_alignments is implemented here instead because it * depends directly on tsk_variant_t and the genotype/allele machinery in * this file (and thus on genotypes.h). This slightly breaks that layering * convention but keeps the implementation close to the variant code. */ int tsk_treeseq_decode_alignments(const tsk_treeseq_t *self, const char *ref_seq, tsk_size_t ref_seq_length, const tsk_id_t *nodes, tsk_size_t num_nodes, double left, double right, char missing_data_character, char *alignments_out, tsk_flags_t options) { int ret = 0; tsk_size_t i, L; char *row = NULL; if (!tsk_treeseq_get_discrete_genome(self)) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if (ref_seq == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if (ref_seq_length != (tsk_size_t) tsk_treeseq_get_sequence_length(self)) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if (trunc(left) != left || trunc(right) != right) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if (left < 0 || right > tsk_treeseq_get_sequence_length(self) || (tsk_size_t) left >= (tsk_size_t) right) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } L = (tsk_size_t) right - (tsk_size_t) left; if (num_nodes == 0) { return 0; } if (nodes == NULL || alignments_out == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } for (i = 0; i < num_nodes; i++) { if (nodes[i] < 0 || nodes[i] >= (tsk_id_t) tsk_treeseq_get_num_nodes(self)) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } } /* Fill rows with the reference slice */ for (i = 0; i < num_nodes; i++) { row = alignments_out + i * L; tsk_memcpy(row, ref_seq + (tsk_size_t) left, L); } if (!(options & TSK_ISOLATED_NOT_MISSING)) { ret = 
tsk_treeseq_decode_alignments_overlay_missing(self, nodes, num_nodes, left, right, missing_data_character, L, alignments_out); if (ret != 0) { goto out; } } ret = tsk_treeseq_decode_alignments_overlay_sites(self, nodes, num_nodes, left, right, missing_data_character, L, alignments_out, options); if (ret != 0) { goto out; } out: return ret; } ================================================ FILE: c/tskit/genotypes.h ================================================ /* * MIT License * * Copyright (c) 2019-2022 Tskit Developers * Copyright (c) 2016-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef TSK_GENOTYPES_H #define TSK_GENOTYPES_H #ifdef __cplusplus extern "C" { #endif #include #define TSK_ISOLATED_NOT_MISSING (1 << 1) /** @brief A variant at a specific site. @rst Used to generate the genotypes for a given set of samples at a given site. 
@endrst */ typedef struct { /** @brief Unowned reference to the tree sequence of the variant */ const tsk_treeseq_t *tree_sequence; /** @brief The site this variant is currently decoded at*/ tsk_site_t site; /* Tree used to decode genotypes; it is positioned at the site when decoding */ tsk_tree_t tree; /** @brief Array of allele strings that the genotypes of the variant refer to * These are not NULL terminated - use `allele_lengths`, for example: * `printf("%.*s", (int) var->allele_lengths[j], var->alleles[j]);` */ const char **alleles; /** @brief Lengths of the allele strings */ tsk_size_t *allele_lengths; /** @brief Length of the allele array */ tsk_size_t num_alleles; /* Allocated capacity of the alleles and allele_lengths arrays */ tsk_size_t max_alleles; /** @brief True if at least one of the genotypes decoded at the current site is * ``TSK_MISSING_DATA``, false otherwise. Isolated nodes are only decoded as * missing when ``TSK_ISOLATED_NOT_MISSING`` was not specified. */ bool has_missing_data; /** @brief Array of genotypes for the current site */ int32_t *genotypes; /** @brief Number of samples */ tsk_size_t num_samples; /** @brief Array of sample ids used*/ tsk_id_t *samples; const tsk_id_t *sample_index_map; /* True if a fixed list of alleles was supplied at initialisation */ bool user_alleles; char *user_alleles_mem; tsk_id_t *traversal_stack; tsk_flags_t options; tsk_id_t *alt_samples; tsk_id_t *alt_sample_index_map; } tsk_variant_t; /* All vargen related structs and methods were deprecated in C API v1.0 */ typedef struct { const tsk_treeseq_t *tree_sequence; tsk_id_t site_index; tsk_variant_t variant; } tsk_vargen_t; /** @defgroup VARIANT_API_GROUP Variant API for obtaining genotypes. @{ */ /** @brief Initialises the variant by allocating the internal memory @rst This must be called before any operations are performed on the variant. See the :ref:`sec_c_api_overview_structure` for details on how objects are initialised and freed. @endrst @param self A pointer to an uninitialised tsk_variant_t object. @param tree_sequence A pointer to the tree sequence from which this variant will decode genotypes. No copy is taken, so this tree sequence must persist for the lifetime of the variant. 
@param samples Optional. Either `NULL` or an array of node ids of the samples that are to have their genotypes decoded. A copy of this array will be taken by the variant. If `NULL` then the samples from the tree sequence will be used. @param num_samples The number of ids in the samples array, ignored if `samples` is `NULL` @param alleles Optional. Either ``NULL`` or an array of string alleles with a terminal ``NULL`` sentinel value. If specified, the genotypes will be decoded to match the index in this allele array. If ``NULL`` then alleles will be automatically determined from the mutations encountered. @param options Variant options. Either ``0`` or ``TSK_ISOLATED_NOT_MISSING`` which if specified indicates that isolated sample nodes should not be decoded as the "missing" state but as the ancestral state (or the state of any mutation above them). @return Return 0 on success or a negative value on failure. */ int tsk_variant_init(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, tsk_flags_t options); /** @brief Copies the state of this variant to another variant @rst Copies the site, genotypes and alleles from this variant to another. Note that the other variant should be uninitialised as this method does not free any memory that the other variant owns. After copying `other` is frozen and this restricts it from being further decoded at any site. `self` remains unchanged. @endrst @param self A pointer to an initialised and decoded tsk_variant_t object. @param other A pointer to an uninitialised tsk_variant_t object. @return Return 0 on success or a negative value on failure. */ int tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other); /** @brief Decode the genotypes at the given site, storing them in this variant. @rst Decodes the genotypes for this variant's samples, indexed to this variant's alleles, at the specified site. 
This method is most efficient at decoding sites in-order, either forwards or backwards along the tree sequence. Resulting genotypes are stored in the ``genotypes`` member of this variant. @endrst @param self A pointer to an initialised tsk_variant_t object. @param site_id A valid site id for the tree sequence of this variant. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of `tskit`. @return Return 0 on success or a negative value on failure. */ int tsk_variant_decode(tsk_variant_t *self, tsk_id_t site_id, tsk_flags_t options); /** @brief Free the internal memory for the specified variant. @param self A pointer to an initialised tsk_variant_t object. @return Always returns 0. */ int tsk_variant_free(tsk_variant_t *self); /** @brief Print out the state of this variant to the specified stream. This method is intended for debugging purposes and should not be used in production code. The format of the output should **not** be depended on and may change arbitrarily between versions. @param self A pointer to a tsk_variant_t object. @param out The stream to write the summary to. 
*/ void tsk_variant_print_state(const tsk_variant_t *self, FILE *out); /** @} */ /* Deprecated vargen methods (since C API v1.0) */ int tsk_vargen_init(tsk_vargen_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, const char **alleles, tsk_flags_t options); int tsk_vargen_next(tsk_vargen_t *self, tsk_variant_t **variant); int tsk_vargen_free(tsk_vargen_t *self); void tsk_vargen_print_state(const tsk_vargen_t *self, FILE *out); #ifdef __cplusplus } #endif #endif ================================================ FILE: c/tskit/haplotype_matching.c ================================================ /* * MIT License * * Copyright (c) 2019-2025 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include #define MAX_PARSIMONY_WORDS 256 const char *_zero_one_alleles[] = { "0", "1", NULL }; const char *_acgt_alleles[] = { "A", "C", "G", "T", NULL }; static int cmp_double(const void *a, const void *b) { const double *ia = (const double *) a; const double *ib = (const double *) b; return (*ia > *ib) - (*ia < *ib); } static int cmp_argsort(const void *a, const void *b) { const tsk_argsort_t *ia = (const tsk_argsort_t *) a; const tsk_argsort_t *ib = (const tsk_argsort_t *) b; int ret = (ia->value > ib->value) - (ia->value < ib->value); /* Break any ties using the index to ensure consistency */ if (ret == 0) { ret = (ia->index > ib->index) - (ia->index < ib->index); } return ret; } static void tsk_ls_hmm_check_state(tsk_ls_hmm_t *self) { tsk_id_t *T_index = self->transition_index; tsk_value_transition_t *T = self->transitions; tsk_id_t j; for (j = 0; j < (tsk_id_t) self->num_transitions; j++) { if (T[j].tree_node != TSK_NULL) { tsk_bug_assert(T_index[T[j].tree_node] == j); } } /* tsk_bug_assert(self->num_transitions <= self->num_samples); */ if (self->num_transitions > 0) { for (j = 0; j < (tsk_id_t) self->num_nodes; j++) { if (T_index[j] != TSK_NULL) { tsk_bug_assert(T[T_index[j]].tree_node == j); } tsk_bug_assert(self->tree.parent[j] == self->parent[j]); } } } void tsk_ls_hmm_print_state(tsk_ls_hmm_t *self, FILE *out) { tsk_size_t j, l; fprintf(out, "tree_sequence = %p\n", (void *) self->tree_sequence); fprintf(out, "num_sites = %lld\n", (long long) self->num_sites); fprintf(out, "num_samples = %lld\n", (long long) self->num_samples); fprintf(out, "num_values = %lld\n", (long long) self->num_values); fprintf(out, "max_values = %lld\n", (long long) self->max_values); fprintf(out, "num_optimal_value_set_words = %lld\n", (long long) self->num_optimal_value_set_words); fprintf(out, "sites::\n"); for (l = 0; l < self->num_sites; l++) { fprintf(out, "%lld\t%lld\t[", (long long) l, (long long) 
self->num_alleles[l]);
        for (j = 0; j < self->num_alleles[l]; j++) {
            fprintf(out, "%s,", self->alleles[l][j]);
        }
        fprintf(out, "]\n");
    }
    fprintf(out, "transitions::%lld\n", (long long) self->num_transitions);
    for (j = 0; j < self->num_transitions; j++) {
        fprintf(out, "tree_node=%lld\tvalue=%.14f\tvalue_index=%lld\n",
            (long long) self->transitions[j].tree_node, self->transitions[j].value,
            (long long) self->transitions[j].value_index);
    }
    /* The per-node arrays are only dumped while transitions are live. */
    if (self->num_transitions > 0) {
        fprintf(out, "tree::%lld\n", (long long) self->num_nodes);
        for (j = 0; j < self->num_nodes; j++) {
            fprintf(out, "%lld\tparent=%lld\ttransition=%lld\n", (long long) j,
                (long long) self->parent[j], (long long) self->transition_index[j]);
        }
    }
    /* Printing doubles as a consistency check (aborts via tsk_bug_assert). */
    tsk_ls_hmm_check_state(self);
}

/* Initialise an LS HMM for the given tree sequence.
 *
 * Zeroes `self`, allocates all working arrays (sized from the number of
 * sites, nodes, samples and mutations of `tree_sequence`), copies one
 * recombination rate and one mutation rate per site from the supplied
 * arrays, and selects the allele set for every site: ACGT when
 * TSK_ALLELES_ACGT is set in `options`, otherwise the 0/1 alleles.
 *
 * `recombination_rate` and `mutation_rate` are read at indexes
 * [0, num_sites); they are not validated here (see the TODO below).
 *
 * Returns 0 on success, TSK_ERR_NO_MEMORY if any allocation fails, or the
 * error returned by tsk_tree_init. Nothing is released on failure here;
 * presumably the caller is expected to call tsk_ls_hmm_free -- confirm. */
int TSK_WARN_UNUSED
tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence,
    double *recombination_rate, double *mutation_rate, tsk_flags_t options)
{
    int ret = TSK_ERR_GENERIC;
    tsk_size_t l;

    tsk_memset(self, 0, sizeof(tsk_ls_hmm_t));
    self->tree_sequence = tree_sequence;
    self->precision = 6; /* Seems like a safe value, but probably not ideal for perf */
    self->num_sites = tsk_treeseq_get_num_sites(tree_sequence);
    self->num_samples = tsk_treeseq_get_num_samples(tree_sequence);
    self->num_alleles = tsk_malloc(self->num_sites * sizeof(*self->num_alleles));
    self->num_nodes = tsk_treeseq_get_num_nodes(tree_sequence);
    self->parent = tsk_malloc(self->num_nodes * sizeof(*self->parent));
    self->allelic_state = tsk_malloc(self->num_nodes * sizeof(*self->allelic_state));
    self->transition_index
        = tsk_malloc(self->num_nodes * sizeof(*self->transition_index));
    self->transition_stack
        = tsk_malloc(self->num_nodes * sizeof(*self->transition_stack));
    /* We can't have more than 2 * num_samples transitions, so we use this as the
     * upper bound. Because of the implementation, we'll also have to worry about
     * the extra mutations at the first site, which in worst case involves all
     * mutations. We can definitely save some memory here if we want to.*/
    self->max_transitions
        = 2 * self->num_samples + tsk_treeseq_get_num_mutations(tree_sequence);
    /* FIXME Arbitrarily doubling this after hitting problems */
    self->max_transitions *= 2;
    self->transitions = tsk_malloc(self->max_transitions * sizeof(*self->transitions));
    /* NOTE(review): sized via *self->transitions rather than
     * *self->transitions_copy; presumably both share one element type. */
    self->transitions_copy
        = tsk_malloc(self->max_transitions * sizeof(*self->transitions));
    self->num_transition_samples
        = tsk_malloc(self->max_transitions * sizeof(*self->num_transition_samples));
    self->transition_parent
        = tsk_malloc(self->max_transitions * sizeof(*self->transition_parent));
    self->transition_time_order
        = tsk_malloc(self->max_transitions * sizeof(*self->transition_time_order));
    self->values = tsk_malloc(self->max_transitions * sizeof(*self->values));
    self->recombination_rate
        = tsk_malloc(self->num_sites * sizeof(*self->recombination_rate));
    self->mutation_rate = tsk_malloc(self->num_sites * sizeof(*self->mutation_rate));
    self->alleles = tsk_calloc(self->num_sites, sizeof(*self->alleles));
    /* One combined check: any single failed allocation aborts initialisation. */
    if (self->num_alleles == NULL || self->parent == NULL || self->allelic_state == NULL
        || self->transition_index == NULL || self->transition_stack == NULL
        || self->transitions == NULL || self->transitions_copy == NULL
        || self->num_transition_samples == NULL || self->transition_parent == NULL
        || self->transition_time_order == NULL || self->values == NULL
        || self->recombination_rate == NULL || self->mutation_rate == NULL
        || self->alleles == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }
    for (l = 0; l < self->num_sites; l++) {
        /* TODO check these inputs */
        self->recombination_rate[l] = recombination_rate[l];
        self->mutation_rate[l] = mutation_rate[l];
        if (options & TSK_ALLELES_ACGT) {
            self->num_alleles[l] = 4;
            self->alleles[l] = _acgt_alleles;
        } else {
            /* Default to the 0/1 alleles */
            self->num_alleles[l] = 2;
            self->alleles[l] = _zero_one_alleles;
        }
    }
    ret = tsk_tree_init(&self->tree, self->tree_sequence, 0);
    if (ret != 0) {
        goto out;
    }
self->num_values = 0; self->max_values = 0; /* Keep this as a struct variable so that we can test overflow, but this * should never be set to more than MAX_PARSIMONY_WORDS as we're doing * a bunch of stack allocations based on this. */ self->max_parsimony_words = MAX_PARSIMONY_WORDS; ret = 0; out: return ret; } int tsk_ls_hmm_set_precision(tsk_ls_hmm_t *self, unsigned int precision) { self->precision = precision; return 0; } int tsk_ls_hmm_free(tsk_ls_hmm_t *self) { tsk_tree_free(&self->tree); tsk_safe_free(self->recombination_rate); tsk_safe_free(self->mutation_rate); tsk_safe_free(self->recombination_rate); tsk_safe_free(self->alleles); tsk_safe_free(self->num_alleles); tsk_safe_free(self->parent); tsk_safe_free(self->allelic_state); tsk_safe_free(self->transition_index); tsk_safe_free(self->transition_stack); tsk_safe_free(self->transitions); tsk_safe_free(self->transitions_copy); tsk_safe_free(self->transition_time_order); tsk_safe_free(self->values); tsk_safe_free(self->num_transition_samples); tsk_safe_free(self->transition_parent); tsk_safe_free(self->optimal_value_sets); return 0; } static int tsk_ls_hmm_reset(tsk_ls_hmm_t *self, double value) { int ret = 0; tsk_size_t j; tsk_id_t u; const tsk_id_t *samples; tsk_size_t N = self->num_nodes; tsk_memset(self->parent, 0xff, N * sizeof(*self->parent)); tsk_memset(self->transition_index, 0xff, N * sizeof(*self->transition_index)); tsk_memset(self->allelic_state, 0xff, N * sizeof(*self->allelic_state)); tsk_memset(self->transitions, 0, self->max_transitions * sizeof(*self->transitions)); tsk_memset(self->num_transition_samples, 0, self->max_transitions * sizeof(*self->num_transition_samples)); tsk_memset(self->transition_parent, 0xff, self->max_transitions * sizeof(*self->transition_parent)); samples = tsk_treeseq_get_samples(self->tree_sequence); for (j = 0; j < self->num_samples; j++) { u = samples[j]; self->transitions[j].tree_node = u; self->transitions[j].value = value; self->transition_index[u] = (tsk_id_t) 
j; } self->num_transitions = self->num_samples; return ret; } /* After we have moved on to a new tree we can have transitions still associated * with the old roots, which are now disconnected. Remove. */ static int tsk_ls_hmm_remove_dead_roots(tsk_ls_hmm_t *self) { tsk_id_t *restrict T_index = self->transition_index; tsk_value_transition_t *restrict T = self->transitions; const tsk_id_t *restrict right_sib = self->tree.right_sib; const tsk_id_t left_root = tsk_tree_get_left_root(&self->tree); const tsk_id_t *restrict parent = self->parent; tsk_id_t root, u; tsk_size_t j; const tsk_id_t root_marker = -2; for (root = left_root; root != TSK_NULL; root = right_sib[root]) { if (T_index[root] != TSK_NULL) { /* Use the value_index slot as a marker. We don't use this between * iterations, so it's safe to appropriate here */ T[T_index[root]].value_index = root_marker; } } for (j = 0; j < self->num_transitions; j++) { u = T[j].tree_node; if (u != TSK_NULL) { if (parent[u] == TSK_NULL && T[j].value_index != root_marker) { T_index[u] = TSK_NULL; T[j].tree_node = TSK_NULL; } T[j].value_index = -1; } } return 0; } static int tsk_ls_hmm_update_tree(tsk_ls_hmm_t *self, int direction) { int ret = 0; tsk_id_t *restrict parent = self->parent; tsk_id_t *restrict T_index = self->transition_index; const tsk_id_t *restrict edges_child = self->tree_sequence->tables->edges.child; const tsk_id_t *restrict edges_parent = self->tree_sequence->tables->edges.parent; tsk_value_transition_t *restrict T = self->transitions; tsk_id_t u, c, p, j, e; tsk_value_transition_t *vt; tsk_tree_position_t tree_pos; tree_pos = self->tree.tree_pos; for (j = tree_pos.out.start; j != tree_pos.out.stop; j += direction) { e = tree_pos.out.order[j]; c = edges_child[e]; u = c; if (T_index[u] == TSK_NULL) { /* Ensure the subtree we're detaching has a transition at the root */ while (T_index[u] == TSK_NULL) { u = parent[u]; tsk_bug_assert(u != TSK_NULL); } tsk_bug_assert(self->num_transitions < self->max_transitions); 
T_index[c] = (tsk_id_t) self->num_transitions; T[self->num_transitions].tree_node = c; T[self->num_transitions].value = T[T_index[u]].value; self->num_transitions++; } parent[c] = TSK_NULL; } for (j = tree_pos.in.start; j != tree_pos.in.stop; j += direction) { e = tree_pos.in.order[j]; c = edges_child[e]; p = edges_parent[e]; parent[c] = p; u = p; if (parent[p] == TSK_NULL) { /* Grafting onto a new root. */ if (T_index[p] == TSK_NULL) { T_index[p] = (tsk_id_t) self->num_transitions; tsk_bug_assert(self->num_transitions < self->max_transitions); T[self->num_transitions].tree_node = p; T[self->num_transitions].value = T[T_index[c]].value; self->num_transitions++; } } else { /* Grafting into an existing subtree. */ while (T_index[u] == TSK_NULL) { u = parent[u]; } tsk_bug_assert(u != TSK_NULL); } tsk_bug_assert(T_index[u] != -1 && T_index[c] != -1); if (T[T_index[u]].value == T[T_index[c]].value) { vt = &T[T_index[c]]; /* Mark the value transition as unusued */ vt->value = -1; vt->tree_node = TSK_NULL; T_index[c] = TSK_NULL; } } ret = tsk_ls_hmm_remove_dead_roots(self); return ret; } static int tsk_ls_hmm_get_allele_index(tsk_ls_hmm_t *self, tsk_id_t site, const char *allele_state, const tsk_size_t allele_length) { /* Note we're not doing tsk_trace_error here because it would require changing * the logic of the function. 
Could be done easily enough, though */ int ret = TSK_ERR_ALLELE_NOT_FOUND; const char **alleles = self->alleles[site]; const tsk_id_t num_alleles = (tsk_id_t) self->num_alleles[site]; tsk_id_t j; for (j = 0; j < num_alleles; j++) { if (strlen(alleles[j]) != allele_length) { break; } if (strncmp(alleles[j], allele_state, (size_t) allele_length) == 0) { ret = (int) j; break; } } return ret; } static int tsk_ls_hmm_update_probabilities( tsk_ls_hmm_t *self, const tsk_site_t *site, int32_t haplotype_state) { int ret = 0; tsk_id_t root; tsk_tree_t *tree = &self->tree; tsk_id_t *restrict parent = self->parent; tsk_id_t *restrict T_index = self->transition_index; tsk_value_transition_t *restrict T = self->transitions; int32_t *restrict allelic_state = self->allelic_state; const tsk_id_t left_root = tsk_tree_get_left_root(tree); tsk_mutation_t mut; tsk_id_t j, u, v; double x; bool match; /* Set the allelic states */ ret = tsk_ls_hmm_get_allele_index( self, site->id, site->ancestral_state, site->ancestral_state_length); if (ret < 0) { goto out; } for (root = left_root; root != TSK_NULL; root = tree->right_sib[root]) { allelic_state[root] = (int32_t) ret; } for (j = 0; j < (tsk_id_t) site->mutations_length; j++) { mut = site->mutations[j]; ret = tsk_ls_hmm_get_allele_index( self, site->id, mut.derived_state, mut.derived_state_length); if (ret < 0) { goto out; } u = mut.node; allelic_state[u] = (int32_t) ret; if (T_index[u] == TSK_NULL) { while (T_index[u] == TSK_NULL) { u = parent[u]; } tsk_bug_assert(self->num_transitions < self->max_transitions); T_index[mut.node] = (tsk_id_t) self->num_transitions; T[self->num_transitions].tree_node = mut.node; T[self->num_transitions].value = T[T_index[u]].value; self->num_transitions++; } } for (j = 0; j < (tsk_id_t) self->num_transitions; j++) { u = T[j].tree_node; if (u != TSK_NULL) { /* Get the allelic_state at u. 
*/
            v = u;
            while (allelic_state[v] == TSK_MISSING_DATA) {
                v = parent[v];
                tsk_bug_assert(v != -1);
            }
            /* A missing haplotype state is treated as matching every allele. */
            match = haplotype_state == TSK_MISSING_DATA
                    || haplotype_state == allelic_state[v];
            ret = self->next_probability(self, site->id, T[j].value, match, u, &x);
            if (ret != 0) {
                goto out;
            }
            T[j].value = x;
        }
    }

    /* Unset the allelic states */
    for (root = left_root; root != TSK_NULL; root = tree->right_sib[root]) {
        allelic_state[root] = TSK_MISSING_DATA;
    }
    for (j = 0; j < (tsk_id_t) site->mutations_length; j++) {
        mut = site->mutations[j];
        allelic_state[mut.node] = TSK_MISSING_DATA;
    }
    ret = 0;
out:
    return ret;
}

/* Collect the distinct values held by the live transitions into
 * self->values (sorted ascending, duplicates removed), record how many there
 * are in self->num_values, and set each live transition's value_index to the
 * position of its value in that array. Requires at least one live
 * transition. Always returns 0. */
static int
tsk_ls_hmm_discretise_values(tsk_ls_hmm_t *self)
{
    int ret = 0;
    tsk_value_transition_t *T = self->transitions;
    double *values = self->values;
    tsk_size_t j, k, num_values;

    num_values = 0;
    for (j = 0; j < self->num_transitions; j++) {
        if (T[j].tree_node != TSK_NULL) {
            values[num_values] = T[j].value;
            num_values++;
        }
    }
    tsk_bug_assert(num_values > 0);

    qsort(values, (size_t) num_values, sizeof(double), cmp_double);

    /* In-place removal of adjacent duplicates; k is the last kept slot. */
    k = 0;
    for (j = 1; j < num_values; j++) {
        if (values[j] != values[k]) {
            k++;
            values[k] = values[j];
        }
    }
    num_values = k + 1;
    self->num_values = num_values;
    for (j = 0; j < self->num_transitions; j++) {
        if (T[j].tree_node != TSK_NULL) {
            T[j].value_index
                = (tsk_id_t) tsk_search_sorted(values, num_values, T[j].value);
            tsk_bug_assert(T[j].value == self->values[T[j].value_index]);
        }
    }
    return ret;
}

/*
 * TODO We also have these functions in tree.c where they're used in the
 * parsimony calculations (which are slightly different). It would be good to bring
 * these together, or at least avoid having the same function in two
 * files. Keeping it as it is for now so that it can be inlined, since
 * it's perf-sensitive.
 */
static inline tsk_id_t
get_smallest_set_bit(uint64_t v)
{
    /* This is an inefficient implementation, there are several better
     * approaches.
On GCC we can use * return (uint8_t) (__builtin_ffsll((long long) v) - 1); */ uint64_t t = 1; tsk_id_t r = 0; assert(v != 0); while ((v & t) == 0) { t <<= 1; r++; } return r; } static inline uint64_t set_bit(uint64_t value, uint8_t bit) { return value | (1ULL << bit); } static inline bool bit_is_set(uint64_t value, uint8_t bit) { return (value & (1ULL << bit)) != 0; } static inline tsk_id_t get_smallest_element(const uint64_t *restrict A, tsk_size_t u, tsk_size_t num_words) { tsk_size_t base = u * num_words; const uint64_t *restrict a = A + base; tsk_id_t j = 0; while (a[j] == 0) { j++; tsk_bug_assert(j < (tsk_id_t) num_words); } return j * 64 + get_smallest_set_bit(a[j]); } /* static variables are zero-initialised by default. */ static const uint64_t zero_block[MAX_PARSIMONY_WORDS]; static inline bool all_zero(const uint64_t *restrict A, tsk_id_t u, tsk_size_t num_words) { if (num_words == 1) { return A[u] == 0; } else { return tsk_memcmp( zero_block, A + (tsk_size_t) u * num_words, num_words * sizeof(*A)) == 0; } } static inline bool element_in( const uint64_t *restrict A, tsk_id_t u, const tsk_id_t state, tsk_size_t num_words) { tsk_size_t index = ((tsk_size_t) u) * num_words + (tsk_size_t) (state / 64); return (A[index] & (1ULL << (state % 64))) != 0; } static inline void set_optimal_value( uint64_t *restrict A, tsk_id_t u, const tsk_size_t num_words, tsk_id_t state) { tsk_size_t index = ((tsk_size_t) u) * num_words + (tsk_size_t) (state / 64); tsk_bug_assert(((tsk_size_t) state) / 64 < num_words); A[index] |= 1ULL << (state % 64); } /* TODO the implementation here isn't particularly optimal and the way things * were organised was really driven by the old Fitch parsimony algorithm * (which only worked on binary trees. In particular, we should be working * word-by-word where possible rather than iterating by values like we do here. * Needs to be reworked when we're documenting/writing up this algorithm. 
*/
/* Single-word case: compute the set for node `u` from its children.
 *
 * For every value in [0, num_values) count the children of `u` whose set
 * contains it (an empty child set counts as the singleton {parent_state}),
 * then store in A[u] exactly the values reaching the maximum count.
 * Requires num_values < 64. */
static void
compute_optimal_value_1(uint64_t *restrict A, const tsk_id_t *restrict left_child,
    const tsk_id_t *restrict right_sib, const tsk_id_t u, const tsk_id_t parent_state,
    const tsk_size_t num_values)
{
    tsk_id_t v;
    uint64_t child;
    tsk_size_t value_count[64], max_value_count;
    uint8_t j;

    assert(num_values < 64);

    tsk_memset(value_count, 0, num_values * sizeof(*value_count));
    for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
        child = A[v];
        /* If the set for a given child is empty, then we know it inherits
         * directly from the parent state and must be a singleton set. */
        if (child == 0) {
            child = 1ULL << parent_state;
        }
        for (j = 0; j < num_values; j++) {
            value_count[j] += bit_is_set(child, j);
        }
    }
    max_value_count = 0;
    for (j = 0; j < num_values; j++) {
        max_value_count = TSK_MAX(max_value_count, value_count[j]);
    }
    A[u] = 0;
    for (j = 0; j < num_values; j++) {
        if (value_count[j] == max_value_count) {
            A[u] = set_bit(A[u], j);
        }
    }
}

/* Multi-word case of compute_optimal_value_1: every node owns `num_words`
 * consecutive words of A. The scratch arrays live on the stack and are
 * sized by MAX_PARSIMONY_WORDS, hence the two bounds asserted below. */
static void
compute_optimal_value_general(uint64_t *restrict A, const tsk_id_t *restrict left_child,
    const tsk_id_t *restrict right_sib, const tsk_id_t u, const tsk_id_t parent_state,
    const tsk_size_t num_values, const tsk_size_t num_words)
{
    tsk_id_t v;
    uint64_t child[MAX_PARSIMONY_WORDS];
    uint64_t *Au;
    tsk_size_t base, word, bit;
    bool child_all_zero;
    const tsk_id_t state_index = parent_state / 64;
    const uint64_t state_word = 1ULL << (parent_state % 64);
    tsk_size_t value_count[64 * MAX_PARSIMONY_WORDS], max_value_count;
    tsk_size_t j;

    tsk_bug_assert(num_values < 64 * MAX_PARSIMONY_WORDS);
    tsk_bug_assert(num_words <= MAX_PARSIMONY_WORDS);
    for (j = 0; j < num_values; j++) {
        value_count[j] = 0;
    }

    for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
        /* Copy the child's words, noting whether all of them are zero. */
        child_all_zero = true;
        base = ((tsk_size_t) v) * num_words;
        for (word = 0; word < num_words; word++) {
            child[word] = A[base + word];
            child_all_zero = child_all_zero && (child[word] == 0);
        }
        /* If the set for a given child is empty, then we know it inherits
         * directly from the parent state and must be a singleton set. */
        if (child_all_zero) {
            child[state_index] = state_word;
        }
        for (j = 0; j < num_values; j++) {
            word = j / 64;
            bit = j % 64;
            assert(word < num_words);
            value_count[j] += bit_is_set(child[word], (uint8_t) bit);
        }
    }
    max_value_count = 0;
    for (j = 0; j < num_values; j++) {
        max_value_count = TSK_MAX(max_value_count, value_count[j]);
    }

    /* Rebuild u's set from scratch: clear it, then add each maximal value. */
    Au = A + ((size_t) u * num_words);
    for (word = 0; word < num_words; word++) {
        Au[word] = 0;
    }
    for (j = 0; j < num_values; j++) {
        if (value_count[j] == max_value_count) {
            word = j / 64;
            bit = j % 64;
            Au[word] = set_bit(Au[word], (uint8_t) bit);
        }
    }
}

/* Dispatch to the single-word fast path when each set fits in one word,
 * and to the general multi-word implementation otherwise. */
static void
compute_optimal_value(uint64_t *restrict A, const tsk_id_t *restrict left_child,
    const tsk_id_t *restrict right_sib, const tsk_id_t u, const tsk_id_t parent_state,
    const tsk_size_t num_values, const tsk_size_t num_words)
{
    if (num_words == 1) {
        compute_optimal_value_1(A, left_child, right_sib, u, parent_state, num_values);
    } else {
        compute_optimal_value_general(
            A, left_child, right_sib, u, parent_state, num_values, num_words);
    }
}

static int
tsk_ls_hmm_setup_optimal_value_sets(tsk_ls_hmm_t *self)
{
    int ret = 0;

    /* We expect that most of the time there will be one word per optimal_value set,
     * but there will be times when we need more than one word. This approach
     * lets us expand the memory if we need to, but when the number of
     * values goes back below 64 we revert to using one word per set. We
     * could in principle release back the memory as well, but it doesn't seem
     * worth the bother.
*/
    self->num_optimal_value_set_words = (self->num_values / 64) + 1;
    if (self->num_optimal_value_set_words > self->max_parsimony_words) {
        ret = tsk_trace_error(TSK_ERR_TOO_MANY_VALUES);
        goto out;
    }
    /* Only (re)allocate when the current value count has outgrown the
     * capacity recorded in max_values; the new buffer is zero-filled. */
    if (self->num_values >= self->max_values) {
        self->max_values = self->num_optimal_value_set_words * 64;
        tsk_safe_free(self->optimal_value_sets);
        self->optimal_value_sets
            = tsk_calloc(self->num_nodes * self->num_optimal_value_set_words,
                sizeof(*self->optimal_value_sets));
        if (self->optimal_value_sets == NULL) {
            ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
            goto out;
        }
    }
out:
    return ret;
}

/* Fill in the optimal value sets for the nodes carrying transitions and for
 * the transition-free nodes on the path from each of them up to its nearest
 * ancestor that has a transition. Transitions are visited in order of
 * increasing node time, with unused slots sorted last. */
static int
tsk_ls_hmm_build_optimal_value_sets(tsk_ls_hmm_t *self)
{
    int ret = 0;
    const double *restrict node_time = self->tree_sequence->tables->nodes.time;
    const tsk_id_t *restrict left_child = self->tree.left_child;
    const tsk_id_t *restrict right_sib = self->tree.right_sib;
    const tsk_id_t *restrict parent = self->parent;
    const tsk_value_transition_t *restrict T = self->transitions;
    const tsk_id_t *restrict T_index = self->transition_index;
    tsk_argsort_t *restrict order = self->transition_time_order;
    const tsk_size_t num_optimal_value_set_words = self->num_optimal_value_set_words;
    uint64_t *restrict A = self->optimal_value_sets;
    tsk_size_t j;
    tsk_id_t u, v, state, parent_state;

    /* argsort the transitions by node time so we can visit them in the
     * correct order */
    for (j = 0; j < self->num_transitions; j++) {
        order[j].index = j;
        /* Unused slots keep DBL_MAX as their sort key. */
        order[j].value = DBL_MAX;
        if (T[j].tree_node != TSK_NULL) {
            order[j].value = node_time[T[j].tree_node];
        }
    }
    qsort(order, (size_t) self->num_transitions, sizeof(*order), cmp_argsort);

    for (j = 0; j < self->num_transitions; j++) {
        u = T[order[j].index].tree_node;
        if (u != TSK_NULL) {
            state = T[order[j].index].value_index;
            if (left_child[u] == TSK_NULL) {
                /* leaf node */
                set_optimal_value(A, u, num_optimal_value_set_words, state);
            } else {
                compute_optimal_value(A, left_child, right_sib, u, state,
                    self->num_values, num_optimal_value_set_words);
            }
            v = parent[u];
            if (v !=
TSK_NULL) {
                /* First find the state of the nearest ancestor that has a
                 * transition, then recompute the sets of the transition-free
                 * nodes between u and that ancestor. */
                while (T_index[v] == TSK_NULL) {
                    v = parent[v];
                    tsk_bug_assert(v != TSK_NULL);
                }
                parent_state = T[T_index[v]].value_index;
                v = parent[u];
                while (T_index[v] == TSK_NULL) {
                    compute_optimal_value(A, left_child, right_sib, v, parent_state,
                        self->num_values, num_optimal_value_set_words);
                    v = parent[v];
                    tsk_bug_assert(v != TSK_NULL);
                }
            }
        }
    }
    return ret;
}

/* Rebuild the transition list from the optimal value sets.
 *
 * The current transitions are copied aside, then every root of the tree is
 * walked top-down with an explicit stack. Each root gets a transition
 * carrying the smallest element of its set; a child with a non-empty set
 * gets a new transition only when the state chosen above it is not in that
 * set; a child with an empty set gets one only when the old and new states
 * differ at the node being expanded. Afterwards the old index entries and
 * the value sets are cleared, and the index and values are filled in for
 * the new transitions. Always returns 0. */
static int
tsk_ls_hmm_redistribute_transitions(tsk_ls_hmm_t *self)
{
    int ret = 0;
    const tsk_id_t *restrict left_child = self->tree.left_child;
    const tsk_id_t *restrict right_sib = self->tree.right_sib;
    const tsk_id_t *restrict parent = self->parent;
    tsk_id_t *restrict T_index = self->transition_index;
    tsk_id_t *restrict T_parent = self->transition_parent;
    tsk_value_transition_t *restrict T = self->transitions;
    tsk_value_transition_t *restrict T_old = self->transitions_copy;
    tsk_transition_stack_t *stack = self->transition_stack;
    uint64_t *restrict A = self->optimal_value_sets;
    const tsk_size_t num_optimal_value_set_words = self->num_optimal_value_set_words;
    tsk_transition_stack_t s, child_s;
    tsk_id_t root, u, v;
    int stack_top = 0;
    tsk_size_t j, old_num_transitions;

    tsk_memcpy(T_old, T, self->num_transitions * sizeof(*T));
    old_num_transitions = self->num_transitions;
    self->num_transitions = 0;

    /* TODO refactor this to push the virtual root onto the stack rather than
     * iterating over the roots. See the existing parsimony implementations
     * for an example. */
    for (root = tsk_tree_get_left_root(&self->tree); root != TSK_NULL;
         root = right_sib[root]) {
        stack[0].tree_node = root;
        stack[0].old_state = T_old[T_index[root]].value_index;
        stack[0].new_state
            = get_smallest_element(A, (tsk_size_t) root, num_optimal_value_set_words);
        stack[0].transition_parent = 0;
        stack_top = 0;

        tsk_bug_assert(self->num_transitions < self->max_transitions);
        T_parent[self->num_transitions] = TSK_NULL;
        T[self->num_transitions].tree_node = stack[0].tree_node;
        T[self->num_transitions].value_index = stack[0].new_state;
        self->num_transitions++;

        while (stack_top >= 0) {
            s = stack[stack_top];
            stack_top--;
            for (v = left_child[s.tree_node]; v != TSK_NULL; v = right_sib[v]) {
                /* The child starts out inheriting everything from s. */
                child_s = s;
                child_s.tree_node = v;
                if (T_index[v] != TSK_NULL) {
                    child_s.old_state = T_old[T_index[v]].value_index;
                }
                if (!all_zero(A, v, num_optimal_value_set_words)) {
                    if (!element_in(A, v, s.new_state, num_optimal_value_set_words)) {
                        child_s.new_state = get_smallest_element(
                            A, (tsk_size_t) v, num_optimal_value_set_words);
                        child_s.transition_parent = (tsk_id_t) self->num_transitions;
                        /* Add a new transition */
                        tsk_bug_assert(self->num_transitions < self->max_transitions);
                        T_parent[self->num_transitions] = s.transition_parent;
                        T[self->num_transitions].tree_node = v;
                        T[self->num_transitions].value_index = child_s.new_state;
                        self->num_transitions++;
                    }
                    stack_top++;
                    stack[stack_top] = child_s;
                } else {
                    /* Node that we didn't visit when moving up the tree */
                    if (s.old_state != s.new_state) {
                        tsk_bug_assert(self->num_transitions < self->max_transitions);
                        T_parent[self->num_transitions] = s.transition_parent;
                        T[self->num_transitions].tree_node = v;
                        T[self->num_transitions].value_index = s.old_state;
                        self->num_transitions++;
                    }
                }
            }
        }
    }

    /* Unset the old T_index pointers and optimal_value sets. */
    for (j = 0; j < old_num_transitions; j++) {
        u = T_old[j].tree_node;
        if (u != TSK_NULL) {
            T_index[u] = TSK_NULL;
            /* Clear sets going up the tree, stopping at the first node whose
             * set is already empty. */
            while (u != TSK_NULL && !all_zero(A, u, num_optimal_value_set_words)) {
                tsk_memset(A + ((tsk_size_t) u) * num_optimal_value_set_words, 0,
                    num_optimal_value_set_words * sizeof(uint64_t));
                u = parent[u];
            }
        }
    }
    /* Set the new pointers for transition nodes and the values.*/
    for (j = 0; j < self->num_transitions; j++) {
        T_index[T[j].tree_node] = (tsk_id_t) j;
        T[j].value = self->values[T[j].value_index];
    }
    return ret;
}

/* Run the four compression stages in order: discretise the values, size the
 * value sets, build them, then redistribute the transitions. Stops at and
 * returns the first non-zero result; returns 0 when all stages succeed. */
static int
tsk_ls_hmm_compress(tsk_ls_hmm_t *self)
{
    int ret = 0;

    ret = tsk_ls_hmm_discretise_values(self);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_ls_hmm_setup_optimal_value_sets(self);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_ls_hmm_build_optimal_value_sets(self);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_ls_hmm_redistribute_transitions(self);
    if (ret != 0) {
        goto out;
    }
out:
    return ret;
}

static int
tsk_ls_hmm_process_site_forward(
    tsk_ls_hmm_t *self, const tsk_site_t *site, int32_t haplotype_state)
{
    int ret = 0;
    double x, normalisation_factor;
    tsk_compressed_matrix_t *output = (tsk_compressed_matrix_t *) self->output;
    tsk_value_transition_t *restrict T = self->transitions;
    const unsigned int precision = (unsigned int) self->precision;
    tsk_size_t j;

    ret = tsk_ls_hmm_update_probabilities(self, site, haplotype_state);
    if (ret != 0) {
        goto out;
    }
    /* See notes in the Python implementation on why we don't want to compress
     * here, but rather should be doing it after rounding.
*/ ret = tsk_ls_hmm_compress(self); if (ret != 0) { goto out; } tsk_bug_assert(self->num_transitions <= self->num_samples); normalisation_factor = self->compute_normalisation_factor(self); if (normalisation_factor == 0) { ret = tsk_trace_error(TSK_ERR_MATCH_IMPOSSIBLE); goto out; } for (j = 0; j < self->num_transitions; j++) { tsk_bug_assert(T[j].tree_node != TSK_NULL); x = T[j].value / normalisation_factor; T[j].value = tsk_round(x, precision); } ret = tsk_compressed_matrix_store_site( output, site->id, normalisation_factor, (tsk_size_t) self->num_transitions, T); out: return ret; } static int tsk_ls_hmm_run_forward(tsk_ls_hmm_t *self, int32_t *haplotype) { int ret = 0; int t_ret; const tsk_site_t *sites; tsk_size_t j, num_sites; const double n = (double) self->num_samples; ret = tsk_ls_hmm_reset(self, 1 / n); if (ret != 0) { goto out; } for (t_ret = tsk_tree_first(&self->tree); t_ret == TSK_TREE_OK; t_ret = tsk_tree_next(&self->tree)) { ret = tsk_ls_hmm_update_tree(self, TSK_DIR_FORWARD); if (ret != 0) { goto out; } /* tsk_ls_hmm_check_state(self); */ ret = tsk_tree_get_sites(&self->tree, &sites, &num_sites); if (ret != 0) { goto out; } for (j = 0; j < num_sites; j++) { ret = tsk_ls_hmm_process_site_forward( self, &sites[j], haplotype[sites[j].id]); if (ret != 0) { goto out; } } } /* Set to zero so we can print and check the state OK. 
*/ self->num_transitions = 0; if (t_ret != 0) { ret = t_ret; goto out; } out: return ret; } /**************************************************************** * Forward Algorithm ****************************************************************/ static double tsk_ls_hmm_compute_normalisation_factor_forward(tsk_ls_hmm_t *self) { tsk_size_t *restrict N = self->num_transition_samples; tsk_value_transition_t *restrict T = self->transitions; const tsk_id_t *restrict T_parent = self->transition_parent; const tsk_size_t *restrict num_samples = self->tree.num_samples; const tsk_id_t num_transitions = (tsk_id_t) self->num_transitions; double normalisation_factor; tsk_id_t j; /* Compute the number of samples directly inheriting from each transition */ for (j = 0; j < num_transitions; j++) { tsk_bug_assert(T[j].tree_node != TSK_NULL); N[j] = num_samples[T[j].tree_node]; } for (j = 0; j < num_transitions; j++) { if (T_parent[j] != TSK_NULL) { N[T_parent[j]] -= N[j]; } } /* Compute the normalising constant used to avoid underflow */ normalisation_factor = 0; for (j = 0; j < num_transitions; j++) { normalisation_factor += (double) N[j] * T[j].value; } return normalisation_factor; } static int tsk_ls_hmm_next_probability_forward(tsk_ls_hmm_t *self, tsk_id_t site_id, double p_last, bool is_match, tsk_id_t TSK_UNUSED(node), double *result) { const double rho = self->recombination_rate[site_id]; const double mu = self->mutation_rate[site_id]; const double n = (double) self->num_samples; const double num_alleles = self->num_alleles[site_id]; double p_t, p_e; p_t = p_last * (1 - rho) + rho / n; p_e = mu; if (is_match) { p_e = 1 - (num_alleles - 1) * mu; } *result = p_t * p_e; return 0; } int tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_compressed_matrix_t *output, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_compressed_matrix_init(output, self->tree_sequence, 0, 0); if (ret != 0) { goto out; } } else { if (output->tree_sequence != 
self->tree_sequence) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = tsk_compressed_matrix_clear(output); if (ret != 0) { goto out; } } self->next_probability = tsk_ls_hmm_next_probability_forward; self->compute_normalisation_factor = tsk_ls_hmm_compute_normalisation_factor_forward; self->output = output; ret = tsk_ls_hmm_run_forward(self, haplotype); out: return ret; } /**************************************************************** * Backward Algorithm ****************************************************************/ static int tsk_ls_hmm_next_probability_backward(tsk_ls_hmm_t *self, tsk_id_t site_id, double p_last, bool is_match, tsk_id_t TSK_UNUSED(node), double *result) { const double mu = self->mutation_rate[site_id]; const double num_alleles = self->num_alleles[site_id]; double p_e; p_e = mu; if (is_match) { p_e = 1 - (num_alleles - 1) * mu; } *result = p_last * p_e; return 0; } static int tsk_ls_hmm_process_site_backward(tsk_ls_hmm_t *self, const tsk_site_t *site, const int32_t haplotype_state, const double normalisation_factor) { int ret = 0; double x, b_last_sum; tsk_compressed_matrix_t *output = (tsk_compressed_matrix_t *) self->output; tsk_value_transition_t *restrict T = self->transitions; const unsigned int precision = (unsigned int) self->precision; const double rho = self->recombination_rate[site->id]; const double n = (double) self->num_samples; tsk_size_t j; /* FIXME!!! We are calling compress twice here because we need to compress * immediately before calling store_site in order to filter out -1 nodes, * and also (crucially) to ensure that the value transitions are listed * in preorder, which we rely on later for decoding. 
* * https://github.com/tskit-dev/tskit/issues/2803 */ ret = tsk_ls_hmm_compress(self); if (ret != 0) { goto out; } ret = tsk_compressed_matrix_store_site( output, site->id, normalisation_factor, (tsk_size_t) self->num_transitions, T); if (ret != 0) { goto out; } ret = tsk_ls_hmm_update_probabilities(self, site, haplotype_state); if (ret != 0) { goto out; } /* DO WE NEED THIS compress?? See above */ ret = tsk_ls_hmm_compress(self); if (ret != 0) { goto out; } tsk_bug_assert(self->num_transitions <= self->num_samples); b_last_sum = self->compute_normalisation_factor(self); for (j = 0; j < self->num_transitions; j++) { tsk_bug_assert(T[j].tree_node != TSK_NULL); x = rho * b_last_sum / n + (1 - rho) * T[j].value; x /= normalisation_factor; T[j].value = tsk_round(x, precision); } out: return ret; } static int tsk_ls_hmm_run_backward( tsk_ls_hmm_t *self, int32_t *haplotype, const double *forward_norm) { int ret = 0; int t_ret; const tsk_site_t *sites; double s; tsk_size_t num_sites; tsk_id_t j; ret = tsk_ls_hmm_reset(self, 1); if (ret != 0) { goto out; } for (t_ret = tsk_tree_last(&self->tree); t_ret == TSK_TREE_OK; t_ret = tsk_tree_prev(&self->tree)) { ret = tsk_ls_hmm_update_tree(self, TSK_DIR_REVERSE); if (ret != 0) { goto out; } /* tsk_ls_hmm_check_state(self); */ ret = tsk_tree_get_sites(&self->tree, &sites, &num_sites); if (ret != 0) { goto out; } for (j = (tsk_id_t) num_sites - 1; j >= 0; j--) { s = forward_norm[sites[j].id]; if (s <= 0) { /* NOTE: I'm not sure if this is the correct interpretation, * but norm values of 0 do lead to problems, and this seems * like a simple way of guarding against it. We do seem to * get norm values of 0 with impossible matches from the fwd * matrix. */ ret = tsk_trace_error(TSK_ERR_MATCH_IMPOSSIBLE); goto out; } ret = tsk_ls_hmm_process_site_backward( self, &sites[j], haplotype[sites[j].id], s); if (ret != 0) { goto out; } } } /* Set to zero so we can print and check the state OK. 
*/ self->num_transitions = 0; if (t_ret != 0) { ret = t_ret; goto out; } out: return ret; } int tsk_ls_hmm_backward(tsk_ls_hmm_t *self, int32_t *haplotype, const double *forward_norm, tsk_compressed_matrix_t *output, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_compressed_matrix_init(output, self->tree_sequence, 0, 0); if (ret != 0) { goto out; } } else { if (output->tree_sequence != self->tree_sequence) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = tsk_compressed_matrix_clear(output); if (ret != 0) { goto out; } } self->next_probability = tsk_ls_hmm_next_probability_backward; self->compute_normalisation_factor = tsk_ls_hmm_compute_normalisation_factor_forward; self->output = output; ret = tsk_ls_hmm_run_backward(self, haplotype, forward_norm); out: return ret; } /**************************************************************** * Viterbi Algorithm ****************************************************************/ static double tsk_ls_hmm_compute_normalisation_factor_viterbi(tsk_ls_hmm_t *self) { tsk_value_transition_t *restrict T = self->transitions; const tsk_id_t num_transitions = (tsk_id_t) self->num_transitions; tsk_value_transition_t max_vt; tsk_id_t j; max_vt.value = -1; max_vt.tree_node = 0; /* keep compiler happy */ tsk_bug_assert(num_transitions > 0); for (j = 0; j < num_transitions; j++) { tsk_bug_assert(T[j].tree_node != TSK_NULL); if (T[j].value > max_vt.value) { max_vt = T[j]; } } return max_vt.value; } static int tsk_ls_hmm_next_probability_viterbi(tsk_ls_hmm_t *self, tsk_id_t site, double p_last, bool is_match, tsk_id_t node, double *result) { const double rho = self->recombination_rate[site]; const double mu = self->mutation_rate[site]; const double num_alleles = self->num_alleles[site]; const double n = (double) self->num_samples; double p_recomb, p_no_recomb, p_t, p_e; bool recombination_required = false; p_no_recomb = p_last * (1 - rho + rho / n); p_recomb = rho / n; if (p_no_recomb > 
p_recomb) { p_t = p_no_recomb; } else { p_t = p_recomb; recombination_required = true; } p_e = mu; if (is_match) { p_e = 1 - (num_alleles - 1) * mu; } *result = p_t * p_e; return tsk_viterbi_matrix_add_recombination_required( self->output, site, node, recombination_required); } int tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_viterbi_matrix_t *output, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_viterbi_matrix_init(output, self->tree_sequence, 0, 0); if (ret != 0) { goto out; } } else { if (output->matrix.tree_sequence != self->tree_sequence) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = tsk_viterbi_matrix_clear(output); if (ret != 0) { goto out; } } self->next_probability = tsk_ls_hmm_next_probability_viterbi; self->compute_normalisation_factor = tsk_ls_hmm_compute_normalisation_factor_viterbi; self->output = output; ret = tsk_ls_hmm_run_forward(self, haplotype); out: return ret; } /**************************************************************** * Compressed matrix ****************************************************************/ int tsk_compressed_matrix_init(tsk_compressed_matrix_t *self, tsk_treeseq_t *tree_sequence, tsk_size_t block_size, tsk_flags_t options) { int ret = 0; tsk_memset(self, 0, sizeof(*self)); self->tree_sequence = tree_sequence; self->options = options; self->num_sites = tsk_treeseq_get_num_sites(tree_sequence); self->num_samples = tsk_treeseq_get_num_samples(tree_sequence); self->num_transitions = tsk_malloc(self->num_sites * sizeof(*self->num_transitions)); self->normalisation_factor = tsk_malloc(self->num_sites * sizeof(*self->normalisation_factor)); self->values = tsk_malloc(self->num_sites * sizeof(*self->values)); self->nodes = tsk_malloc(self->num_sites * sizeof(*self->nodes)); if (self->num_transitions == NULL || self->values == NULL || self->nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (block_size == 0) { block_size = 1 << 20; } ret 
= tsk_blkalloc_init(&self->memory, (size_t) block_size); if (ret != 0) { goto out; } ret = tsk_compressed_matrix_clear(self); out: return ret; } int tsk_compressed_matrix_free(tsk_compressed_matrix_t *self) { tsk_blkalloc_free(&self->memory); tsk_safe_free(self->num_transitions); tsk_safe_free(self->normalisation_factor); tsk_safe_free(self->values); tsk_safe_free(self->nodes); return 0; } int tsk_compressed_matrix_clear(tsk_compressed_matrix_t *self) { tsk_blkalloc_reset(&self->memory); tsk_memset( self->num_transitions, 0, self->num_sites * sizeof(*self->num_transitions)); tsk_memset(self->normalisation_factor, 0, self->num_sites * sizeof(*self->normalisation_factor)); return 0; } void tsk_compressed_matrix_print_state(tsk_compressed_matrix_t *self, FILE *out) { tsk_size_t l, j; fprintf(out, "Compressed matrix for %p\n", (void *) self->tree_sequence); fprintf(out, "num_sites = %lld\n", (long long) self->num_sites); fprintf(out, "num_samples = %lld\n", (long long) self->num_samples); for (l = 0; l < self->num_sites; l++) { fprintf(out, "%lld\ts=%f\tv=%lld [", (long long) l, self->normalisation_factor[l], (long long) self->num_transitions[l]); for (j = 0; j < self->num_transitions[l]; j++) { fprintf( out, "(%lld, %f)", (long long) self->nodes[l][j], self->values[l][j]); if (j < self->num_transitions[l] - 1) { fprintf(out, ","); } else { fprintf(out, "]\n"); } } } fprintf(out, "Memory:\n"); tsk_blkalloc_print_state(&self->memory, out); } int tsk_compressed_matrix_store_site(tsk_compressed_matrix_t *self, tsk_id_t site, double normalisation_factor, tsk_size_t num_transitions, const tsk_value_transition_t *transitions) { int ret = 0; tsk_size_t j; if (site < 0 || site >= (tsk_id_t) self->num_sites) { ret = tsk_trace_error(TSK_ERR_SITE_OUT_OF_BOUNDS); goto out; } self->num_transitions[site] = num_transitions; self->normalisation_factor[site] = normalisation_factor; self->nodes[site] = tsk_blkalloc_get(&self->memory, (size_t) num_transitions * sizeof(tsk_id_t)); 
self->values[site] = tsk_blkalloc_get(&self->memory, (size_t) num_transitions * sizeof(double)); if (self->nodes[site] == NULL || self->values[site] == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < num_transitions; j++) { tsk_bug_assert(transitions[j].tree_node >= 0); self->values[site][j] = transitions[j].value; self->nodes[site][j] = transitions[j].tree_node; } out: return ret; } static int tsk_compressed_matrix_decode_site(tsk_compressed_matrix_t *self, const tsk_tree_t *tree, const tsk_id_t site, double *values) { int ret = 0; const tsk_id_t *restrict list_left = tree->left_sample; const tsk_id_t *restrict list_right = tree->right_sample; const tsk_id_t *restrict list_next = tree->next_sample; const tsk_id_t num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(self->tree_sequence); tsk_size_t j; tsk_id_t node, index, stop; double value; for (j = 0; j < self->num_transitions[site]; j++) { node = self->nodes[site][j]; if (node < 0 || node >= num_nodes) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } value = self->values[site][j]; index = list_left[node]; if (index == TSK_NULL) { /* It's an error if there are nodes that don't subtend any samples */ ret = tsk_trace_error(TSK_ERR_BAD_COMPRESSED_MATRIX_NODE); goto out; } stop = list_right[node]; while (true) { values[index] = value; if (index == stop) { break; } index = list_next[index]; } } out: return ret; } int tsk_compressed_matrix_decode(tsk_compressed_matrix_t *self, double *values) { int ret = 0; int t_ret; tsk_tree_t tree; tsk_size_t j, num_tree_sites; const tsk_site_t *sites = NULL; tsk_id_t site_id; double *site_array; ret = tsk_tree_init(&tree, self->tree_sequence, TSK_SAMPLE_LISTS); if (ret != 0) { goto out; } for (t_ret = tsk_tree_first(&tree); t_ret == TSK_TREE_OK; t_ret = tsk_tree_next(&tree)) { ret = tsk_tree_get_sites(&tree, &sites, &num_tree_sites); if (ret != 0) { goto out; } for (j = 0; j < num_tree_sites; j++) { site_id = sites[j].id; site_array = 
values + ((tsk_size_t) site_id) * self->num_samples; if (self->num_transitions[site_id] == 0) { tsk_memset(site_array, 0, self->num_samples * sizeof(*site_array)); } else { ret = tsk_compressed_matrix_decode_site( self, &tree, site_id, site_array); if (ret != 0) { goto out; } } } } if (t_ret < 0) { ret = t_ret; goto out; } out: tsk_tree_free(&tree); return ret; } /**************************************************************** * Viterbi matrix ****************************************************************/ static int tsk_viterbi_matrix_expand_recomb_records(tsk_viterbi_matrix_t *self) { int ret = 0; tsk_recomb_required_record *tmp = tsk_realloc( self->recombination_required, self->max_recomb_records * sizeof(*tmp)); if (tmp == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->recombination_required = tmp; out: return ret; } int tsk_viterbi_matrix_init(tsk_viterbi_matrix_t *self, tsk_treeseq_t *tree_sequence, tsk_size_t block_size, tsk_flags_t options) { int ret = 0; tsk_memset(self, 0, sizeof(*self)); if (block_size == 0) { block_size = 1 << 20; /* 1MiB */ } ret = tsk_compressed_matrix_init(&self->matrix, tree_sequence, block_size, options); if (ret != 0) { goto out; } self->max_recomb_records = TSK_MAX(1, block_size / sizeof(tsk_recomb_required_record)); ret = tsk_viterbi_matrix_expand_recomb_records(self); if (ret != 0) { goto out; } /* Add the sentinel at the start to simplify traceback */ self->recombination_required[0].site = -1; ret = tsk_viterbi_matrix_clear(self); out: return ret; } int tsk_viterbi_matrix_free(tsk_viterbi_matrix_t *self) { tsk_compressed_matrix_free(&self->matrix); tsk_safe_free(self->recombination_required); return 0; } int tsk_viterbi_matrix_clear(tsk_viterbi_matrix_t *self) { self->num_recomb_records = 1; tsk_compressed_matrix_clear(&self->matrix); return 0; } void tsk_viterbi_matrix_print_state(tsk_viterbi_matrix_t *self, FILE *out) { tsk_id_t l, j; fprintf(out, "viterbi_matrix\n"); fprintf(out, "num_recomb_records 
= %lld\n", (long long) self->num_recomb_records); fprintf(out, "max_recomb_records = %lld\n", (long long) self->max_recomb_records); j = 1; for (l = 0; l < (tsk_id_t) self->matrix.num_sites; l++) { fprintf(out, "%lld\t[", (long long) l); while (j < (tsk_id_t) self->num_recomb_records && self->recombination_required[j].site == l) { fprintf(out, "(%lld, %d) ", (long long) self->recombination_required[j].node, self->recombination_required[j].required); j++; } fprintf(out, "]\n"); } tsk_compressed_matrix_print_state(&self->matrix, out); } TSK_WARN_UNUSED int tsk_viterbi_matrix_add_recombination_required( tsk_viterbi_matrix_t *self, tsk_id_t site, tsk_id_t node, bool required) { int ret = 0; tsk_recomb_required_record *record; if (self->num_recomb_records == self->max_recomb_records) { self->max_recomb_records *= 2; ret = tsk_viterbi_matrix_expand_recomb_records(self); if (ret != 0) { goto out; } } record = self->recombination_required + self->num_recomb_records; record->site = site; record->node = node; record->required = required; self->num_recomb_records++; out: return ret; } static tsk_id_t tsk_viterbi_matrix_choose_sample( tsk_viterbi_matrix_t *self, tsk_id_t site, tsk_tree_t *tree) { tsk_id_t ret; tsk_id_t u = TSK_NULL; const tsk_flags_t *node_flags = self->matrix.tree_sequence->tables->nodes.flags; const tsk_size_t num_transitions = self->matrix.num_transitions[site]; const tsk_id_t *transition_nodes = self->matrix.nodes[site]; const double *transition_values = self->matrix.values[site]; double max_value = -1; tsk_size_t j; tsk_id_t v; bool found; if (num_transitions == 0) { ret = tsk_trace_error(TSK_ERR_NULL_VITERBI_MATRIX); goto out; } for (j = 0; j < num_transitions; j++) { if (max_value < transition_values[j]) { u = transition_nodes[j]; max_value = transition_values[j]; } } tsk_bug_assert(u != TSK_NULL); while (!(node_flags[u] & TSK_NODE_IS_SAMPLE)) { found = false; for (v = tree->left_child[u]; v != TSK_NULL; v = tree->right_sib[v]) { /* Choose the first 
child that is not in the list of transition nodes */ for (j = 0; j < num_transitions; j++) { if (transition_nodes[j] == v) { break; } } if (j == num_transitions) { u = v; found = true; break; } } /* TODO: should remove this once we're sure this is robust */ tsk_bug_assert(found); } ret = u; out: return ret; } int tsk_viterbi_matrix_traceback( tsk_viterbi_matrix_t *self, tsk_id_t *path, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_site_t site; tsk_id_t u, site_id, current_node; tsk_recomb_required_record *rr_record, *rr_record_tmp; const tsk_id_t num_sites = (tsk_id_t) self->matrix.num_sites; const tsk_id_t num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(self->matrix.tree_sequence); tsk_tree_t tree; tsk_id_t *recombination_tree = tsk_malloc((size_t) num_nodes * sizeof(*recombination_tree)); ret = tsk_tree_init(&tree, self->matrix.tree_sequence, 0); if (ret != 0) { goto out; } if (recombination_tree == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } /* Initialise the path an recombination_tree to contain TSK_NULL */ tsk_memset(path, 0xff, ((size_t) num_sites) * sizeof(*path)); tsk_memset(recombination_tree, 0xff, ((size_t) num_nodes) * sizeof(*path)); current_node = TSK_NULL; rr_record = &self->recombination_required[self->num_recomb_records - 1]; ret = tsk_tree_last(&tree); if (ret < 0) { goto out; } for (site_id = num_sites - 1; site_id >= 0; site_id--) { ret = tsk_treeseq_get_site(self->matrix.tree_sequence, site_id, &site); if (ret != 0) { goto out; } while (tree.interval.left > site.position) { ret = tsk_tree_prev(&tree); if (ret < 0) { goto out; } } tsk_bug_assert(tree.interval.left <= site.position); tsk_bug_assert(site.position < tree.interval.right); /* Fill in the recombination tree */ rr_record_tmp = rr_record; while (rr_record->site == site.id) { recombination_tree[rr_record->node] = rr_record->required; rr_record--; } if (current_node == TSK_NULL) { current_node = tsk_viterbi_matrix_choose_sample(self, site.id, &tree); if 
(current_node < 0) { ret = (int) current_node; goto out; } } path[site.id] = current_node; /* Now traverse up the tree from the current node. The * first marked node tells us whether we need to recombine */ u = current_node; while (u != TSK_NULL && recombination_tree[u] == TSK_NULL) { u = tree.parent[u]; } tsk_bug_assert(u != TSK_NULL); if (recombination_tree[u] == 1) { /* Switch at the next site */ current_node = TSK_NULL; } /* Reset in the recombination tree */ rr_record = rr_record_tmp; while (rr_record->site == site.id) { recombination_tree[rr_record->node] = TSK_NULL; rr_record--; } } ret = 0; out: tsk_tree_free(&tree); tsk_safe_free(recombination_tree); return ret; } ================================================ FILE: c/tskit/haplotype_matching.h ================================================ /* * MIT License * * Copyright (c) 2019-2024 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #ifndef TSK_HAPLOTYPE_MATCHING_H #define TSK_HAPLOTYPE_MATCHING_H #ifdef __cplusplus extern "C" { #endif #include /* Seems like we might use this somewhere else as well, so putting it into the middle * of the flags space */ #define TSK_ALLELES_ACGT (1 << 16) typedef struct { tsk_id_t tree_node; tsk_id_t value_index; double value; } tsk_value_transition_t; typedef struct { tsk_size_t index; double value; } tsk_argsort_t; typedef struct { tsk_id_t tree_node; tsk_id_t old_state; tsk_id_t new_state; tsk_id_t transition_parent; } tsk_transition_stack_t; typedef struct { double normalisation_factor; double *value; tsk_id_t *node; tsk_size_t num_values; } tsk_site_probability_t; typedef struct { tsk_treeseq_t *tree_sequence; tsk_flags_t options; tsk_size_t num_sites; tsk_size_t num_samples; double *normalisation_factor; tsk_size_t *num_transitions; double **values; tsk_id_t **nodes; tsk_blkalloc_t memory; } tsk_compressed_matrix_t; typedef struct { tsk_id_t site; tsk_id_t node; bool required; } tsk_recomb_required_record; typedef struct { tsk_compressed_matrix_t matrix; tsk_recomb_required_record *recombination_required; tsk_size_t num_recomb_records; tsk_size_t max_recomb_records; } tsk_viterbi_matrix_t; typedef struct _tsk_ls_hmm_t { /* input */ tsk_treeseq_t *tree_sequence; double *recombination_rate; double *mutation_rate; const char ***alleles; unsigned int precision; uint32_t *num_alleles; tsk_size_t num_samples; tsk_size_t num_sites; tsk_size_t num_nodes; /* state */ tsk_tree_t tree; tsk_id_t *parent; /* The probability value transitions on the tree */ tsk_value_transition_t *transitions; tsk_value_transition_t *transitions_copy; /* Stack used when distributing transitions on the tree */ tsk_transition_stack_t *transition_stack; /* Map of node_id to index in the transitions list */ tsk_id_t *transition_index; /* Buffer used to argsort the transitions by node time */ tsk_argsort_t *transition_time_order; tsk_size_t num_transitions; tsk_size_t max_transitions; /* 
The distinct values in the transitions */ double *values; tsk_size_t num_values; tsk_size_t max_values; tsk_size_t max_parsimony_words; /* Number of machine words per node optimal value set. */ tsk_size_t num_optimal_value_set_words; uint64_t *optimal_value_sets; /* The parent transition; used during compression */ tsk_id_t *transition_parent; /* The number of samples directly subtended by a transition */ tsk_size_t *num_transition_samples; int32_t *allelic_state; /* Algorithms set these values before they are run */ int (*next_probability)( struct _tsk_ls_hmm_t *, tsk_id_t, double, bool, tsk_id_t, double *); double (*compute_normalisation_factor)(struct _tsk_ls_hmm_t *); void *output; } tsk_ls_hmm_t; /* TODO constify these APIs */ int tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence, double *recombination_rate, double *mutation_rate, tsk_flags_t options); int tsk_ls_hmm_set_precision(tsk_ls_hmm_t *self, unsigned int precision); int tsk_ls_hmm_free(tsk_ls_hmm_t *self); void tsk_ls_hmm_print_state(tsk_ls_hmm_t *self, FILE *out); int tsk_ls_hmm_forward(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_compressed_matrix_t *output, tsk_flags_t options); int tsk_ls_hmm_backward(tsk_ls_hmm_t *self, int32_t *haplotype, const double *forward_norm, tsk_compressed_matrix_t *output, tsk_flags_t options); int tsk_ls_hmm_viterbi(tsk_ls_hmm_t *self, int32_t *haplotype, tsk_viterbi_matrix_t *output, tsk_flags_t options); int tsk_compressed_matrix_init(tsk_compressed_matrix_t *self, tsk_treeseq_t *tree_sequence, tsk_size_t block_size, tsk_flags_t options); int tsk_compressed_matrix_free(tsk_compressed_matrix_t *self); int tsk_compressed_matrix_clear(tsk_compressed_matrix_t *self); void tsk_compressed_matrix_print_state(tsk_compressed_matrix_t *self, FILE *out); int tsk_compressed_matrix_store_site(tsk_compressed_matrix_t *self, tsk_id_t site, double normalisation_factor, tsk_size_t num_transitions, const tsk_value_transition_t *transitions); int 
tsk_compressed_matrix_decode(tsk_compressed_matrix_t *self, double *values); int tsk_viterbi_matrix_init(tsk_viterbi_matrix_t *self, tsk_treeseq_t *tree_sequence, tsk_size_t block_size, tsk_flags_t options); int tsk_viterbi_matrix_free(tsk_viterbi_matrix_t *self); int tsk_viterbi_matrix_clear(tsk_viterbi_matrix_t *self); void tsk_viterbi_matrix_print_state(tsk_viterbi_matrix_t *self, FILE *out); int tsk_viterbi_matrix_add_recombination_required( tsk_viterbi_matrix_t *self, tsk_id_t site, tsk_id_t node, bool required); int tsk_viterbi_matrix_traceback( tsk_viterbi_matrix_t *self, tsk_id_t *path, tsk_flags_t options); #ifdef __cplusplus } #endif #endif ================================================ FILE: c/tskit/stats.c ================================================ /* * MIT License * * Copyright (c) 2018-2025 Tskit Developers * Copyright (c) 2016-2017 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include void tsk_ld_calc_print_state(const tsk_ld_calc_t *self, FILE *out) { fprintf(out, "tree = %p\n", (const void *) &self->tree); fprintf(out, "max_sites = %d\n", (int) self->max_sites); fprintf(out, "max_distance = %f\n", self->max_distance); } int TSK_WARN_UNUSED tsk_ld_calc_init(tsk_ld_calc_t *self, const tsk_treeseq_t *tree_sequence) { int ret = 0; tsk_memset(self, 0, sizeof(*self)); ret = tsk_tree_init(&self->tree, tree_sequence, 0); if (ret != 0) { goto out; } self->tree_sequence = tree_sequence; self->total_samples = tsk_treeseq_get_num_samples(self->tree_sequence); self->sample_buffer = tsk_malloc(self->total_samples * sizeof(*self->sample_buffer)); if (self->sample_buffer == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } out: return ret; } int tsk_ld_calc_free(tsk_ld_calc_t *self) { tsk_tree_free(&self->tree); tsk_safe_free(self->sample_buffer); return 0; } static int tsk_ld_calc_check_site(tsk_ld_calc_t *TSK_UNUSED(self), const tsk_site_t *site) { int ret = 0; /* These are both limitations in the current implementation, there's no * fundamental reason why we can't support them */ if (site->mutations_length != 1) { ret = tsk_trace_error(TSK_ERR_ONLY_INFINITE_SITES); goto out; } if (site->ancestral_state_length == site->mutations[0].derived_state_length && tsk_memcmp(site->ancestral_state, site->mutations[0].derived_state, site->ancestral_state_length) == 0) { ret = tsk_trace_error(TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED); goto out; } out: return ret; } static int tsk_ld_calc_set_focal_samples(tsk_ld_calc_t *self) { int ret = 0; tsk_id_t focal_node = self->focal_site.mutations[0].node; ret = tsk_tree_track_descendant_samples(&self->tree, focal_node); if (ret != 0) { goto out; } self->focal_samples = self->tree.num_tracked_samples[focal_node]; out: return ret; } static int tsk_ld_calc_initialise(tsk_ld_calc_t *self, tsk_id_t a) { int ret = 0; ret = tsk_treeseq_get_site(self->tree_sequence, a, 
&self->focal_site); if (ret != 0) { goto out; } ret = tsk_ld_calc_check_site(self, &self->focal_site); if (ret != 0) { goto out; } ret = tsk_tree_seek(&self->tree, self->focal_site.position, 0); if (ret != 0) { goto out; } ret = tsk_ld_calc_set_focal_samples(self); if (ret != 0) { goto out; } out: return ret; } static int tsk_ld_calc_compute_r2(tsk_ld_calc_t *self, const tsk_site_t *target_site, double *r2) { const double n = (double) self->total_samples; double f_a, f_b, f_ab, D, denom; tsk_id_t node; int ret = tsk_ld_calc_check_site(self, target_site); if (ret != 0) { goto out; } node = target_site->mutations[0].node; f_a = ((double) self->focal_samples) / n; f_b = ((double) self->tree.num_samples[node]) / n; f_ab = ((double) self->tree.num_tracked_samples[node]) / n; D = f_ab - f_a * f_b; denom = f_a * f_b * (1 - f_a) * (1 - f_b); *r2 = (D * D) / denom; out: return ret; } static int tsk_ld_calc_compute_and_append( tsk_ld_calc_t *self, const tsk_site_t *target_site, bool *ret_done) { int ret = 0; double r2; double distance = fabs(self->focal_site.position - target_site->position); bool done = true; if (distance <= self->max_distance && self->result_length < self->max_sites) { ret = tsk_ld_calc_compute_r2(self, target_site, &r2); if (ret != 0) { goto out; } self->result[self->result_length] = r2; self->result_length++; done = false; } *ret_done = done; out: return ret; } static int tsk_ld_calc_run_forward(tsk_ld_calc_t *self) { int ret = 0; tsk_size_t j; bool done = false; for (j = 0; j < self->tree.sites_length; j++) { if (self->tree.sites[j].id > self->focal_site.id) { ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); if (ret != 0) { goto out; } if (done) { break; } } } while (((ret = tsk_tree_next(&self->tree)) == TSK_TREE_OK) && !done) { for (j = 0; j < self->tree.sites_length; j++) { ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); if (ret != 0) { goto out; } if (done) { break; } } } if (ret < 0) { goto out; } 
ret = 0; out: return ret; } static int tsk_ld_calc_run_reverse(tsk_ld_calc_t *self) { int ret = 0; tsk_id_t j; bool done = false; for (j = (tsk_id_t) self->tree.sites_length - 1; j >= 0; j--) { if (self->tree.sites[j].id < self->focal_site.id) { ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); if (ret != 0) { goto out; } if (done) { break; } } } while (((ret = tsk_tree_prev(&self->tree)) == TSK_TREE_OK) && !done) { for (j = (tsk_id_t) self->tree.sites_length - 1; j >= 0; j--) { ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); if (ret != 0) { goto out; } if (done) { break; } } } if (ret < 0) { goto out; } ret = 0; out: return ret; } int tsk_ld_calc_get_r2(tsk_ld_calc_t *self, tsk_id_t a, tsk_id_t b, double *r2) { int ret = 0; tsk_site_t target_site; ret = tsk_ld_calc_initialise(self, a); if (ret != 0) { goto out; } ret = tsk_treeseq_get_site(self->tree_sequence, b, &target_site); if (ret != 0) { goto out; } ret = tsk_tree_seek(&self->tree, target_site.position, 0); if (ret != 0) { goto out; } ret = tsk_ld_calc_compute_r2(self, &target_site, r2); if (ret != 0) { goto out; } out: return ret; } int tsk_ld_calc_get_r2_array(tsk_ld_calc_t *self, tsk_id_t a, int direction, tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values) { int ret = tsk_ld_calc_initialise(self, a); if (ret != 0) { goto out; } self->max_sites = max_sites; self->max_distance = max_distance; self->result_length = 0; self->result = r2; if (direction == TSK_DIR_FORWARD) { ret = tsk_ld_calc_run_forward(self); } else if (direction == TSK_DIR_REVERSE) { ret = tsk_ld_calc_run_reverse(self); } else { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); } if (ret != 0) { goto out; } *num_r2_values = self->result_length; out: return ret; } ================================================ FILE: c/tskit/stats.h ================================================ /* * MIT License * * Copyright (c) 2019-2021 Tskit Developers * Copyright (c) 
2016-2017 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
/* State for pairwise linkage-disequilibrium (r2) calculations over the sites
 * of a tree sequence. Set up with tsk_ld_calc_init and released with
 * tsk_ld_calc_free. */
typedef struct {
    /* Tree sequence being analysed; stored (not copied) by tsk_ld_calc_init. */
    const tsk_treeseq_t *tree_sequence;
    /* The site that r2 values are computed against in the current query. */
    tsk_site_t focal_site;
    /* Number of samples in the tree sequence; the denominator of the
     * frequencies used in the r2 computation. */
    tsk_size_t total_samples;
    /* Number of tracked samples below the node of the focal site's mutation. */
    tsk_size_t focal_samples;
    /* Limits for the current array query: largest allowed distance in
     * position from the focal site, and largest number of results. */
    double max_distance;
    tsk_size_t max_sites;
    /* Tree that is seeked and stepped while running a query. */
    tsk_tree_t tree;
    /* Buffer with room for total_samples ids, allocated at init time. */
    tsk_id_t *sample_buffer;
    /* Output array supplied by the caller of tsk_ld_calc_get_r2_array and
     * the number of values written to it so far. */
    double *result;
    tsk_size_t result_length;
} tsk_ld_calc_t;
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #define TABLE_SEP "-----------------------------------------\n" #define TSK_COL_OPTIONAL (1 << 0) typedef struct { const char *name; void **array_dest; int type; tsk_flags_t options; } read_table_col_t; typedef struct { const char *name; void **data_array_dest; tsk_size_t *data_len_dest; int data_type; tsk_size_t **offset_array_dest; tsk_flags_t options; } read_table_ragged_col_t; typedef struct { const char *name; void **array_dest; tsk_size_t *len_dest; int type; tsk_flags_t options; } read_table_property_t; typedef struct { const char *name; const void *array; tsk_size_t len; int type; } write_table_col_t; typedef struct { const char *name; const void *data_array; tsk_size_t data_len; int data_type; const tsk_size_t *offset_array; tsk_size_t num_rows; } write_table_ragged_col_t; /* Returns true if adding the specified number of rows would result in overflow. * Tables can support indexes from 0 to TSK_MAX_ID, and therefore could have at most * TSK_MAX_ID + 1 rows. However we limit to TSK_MAX_ID rows so that counts of rows * can fit in a tsk_id_t. */ static bool check_table_overflow(tsk_size_t current_size, tsk_size_t additional_rows) { tsk_size_t max_val = TSK_MAX_ID; return additional_rows > max_val || current_size > (max_val - additional_rows); } /* Returns true if adding the specified number of elements would result in overflow * of an offset column. 
/* Loads the ragged columns described by ``cols`` (an array terminated by an
 * entry whose name is NULL) from ``store``.
 *
 * Each ragged column is stored under two keys: ``<name>`` for the data and
 * ``<name>_offset`` for the row offsets. For every entry:
 *
 *  - the data column must be present unless the entry is TSK_COL_OPTIONAL,
 *    and must have the declared data type;
 *  - data and offset columns must be both present or both absent;
 *  - the offset column fixes the row count (its length minus one): if
 *    ``*num_rows`` is still TSK_NUM_ROWS_UNSET it is set, otherwise it must
 *    agree;
 *  - offsets may be stored as KAS_UINT64 (adopted as-is) or KAS_UINT32
 *    (widened into a newly allocated array);
 *  - the final offset must equal the length of the data column.
 *
 * Returns 0 on success or a negative error code. Arrays already written to
 * the destinations in ``cols`` are not released here when an error occurs.
 */
static int
read_table_ragged_cols(kastore_t *store, tsk_size_t *num_rows,
    read_table_ragged_col_t *cols, tsk_flags_t TSK_UNUSED(flags))
{
    int ret = 0;
    size_t data_len = 0; // initial value unused, just to keep the compiler happy.
    size_t offset_len;
    int type;
    read_table_ragged_col_t *col;
    char offset_col_name[TSK_MAX_COL_NAME_LEN];
    bool data_col_present, offset_col_present;
    /* Offset array as returned by the store. This function owns it until it
     * is either handed to the destination or freed. */
    void *store_offset_array = NULL;
    tsk_size_t *offset_array;

    for (col = cols; col->name != NULL; col++) {
        ret = kastore_containss(store, col->name);
        if (ret < 0) {
            ret = tsk_set_kas_error(ret);
            goto out;
        }
        data_col_present = false;
        if (ret == 1) {
            ret = kastore_gets(store, col->name, col->data_array_dest, &data_len, &type);
            if (ret != 0) {
                ret = tsk_set_kas_error(ret);
                goto out;
            }
            if (type != col->data_type) {
                ret = tsk_trace_error(TSK_ERR_BAD_COLUMN_TYPE);
                goto out;
            }
            *col->data_len_dest = (tsk_size_t) data_len;
            data_col_present = true;
        } else if (!(col->options & TSK_COL_OPTIONAL)) {
            ret = tsk_trace_error(TSK_ERR_REQUIRED_COL_NOT_FOUND);
            goto out;
        }

        /* Build the key of the companion offset column. The assert guards
         * the fixed-size buffer in debug builds only. */
        assert(strlen(col->name) + strlen("_offset") + 2 < sizeof(offset_col_name));
        strcpy(offset_col_name, col->name);
        strcat(offset_col_name, "_offset");

        ret = kastore_containss(store, offset_col_name);
        if (ret < 0) {
            ret = tsk_set_kas_error(ret);
            goto out;
        }
        offset_col_present = ret == 1;
        if (offset_col_present != data_col_present) {
            ret = tsk_trace_error(TSK_ERR_BOTH_COLUMNS_REQUIRED);
            goto out;
        }
        if (offset_col_present) {
            ret = kastore_gets(
                store, offset_col_name, &store_offset_array, &offset_len, &type);
            if (ret != 0) {
                ret = tsk_set_kas_error(ret);
                goto out;
            }
            /* A table with zero rows will still have an offset length of 1;
             * catching this here prevents underflows in the logic below */
            if (offset_len == 0) {
                ret = tsk_trace_error(TSK_ERR_FILE_FORMAT);
                goto out;
            }
            /* Some tables have only ragged columns */
            if (*num_rows == TSK_NUM_ROWS_UNSET) {
                *num_rows = (tsk_size_t) offset_len - 1;
            } else {
                if (*num_rows != (tsk_size_t) offset_len - 1) {
                    ret = tsk_trace_error(TSK_ERR_FILE_FORMAT);
                    goto out;
                }
            }
            if (type == KAS_UINT64) {
                /* Already in the in-memory representation: transfer
                 * ownership to the destination. */
                *col->offset_array_dest = (uint64_t *) store_offset_array;
                store_offset_array = NULL;
            } else if (type == KAS_UINT32) {
                /* Widen into a new 64 bit array, then drop the 32 bit one. */
                ret = cast_offset_array(col, (uint32_t *) store_offset_array, *num_rows);
                if (ret != 0) {
                    goto out;
                }
                tsk_safe_free(store_offset_array);
                store_offset_array = NULL;
            } else {
                ret = tsk_trace_error(TSK_ERR_BAD_COLUMN_TYPE);
                goto out;
            }
            /* The last offset must point exactly one past the end of the
             * data column read above. */
            offset_array = *col->offset_array_dest;
            if (offset_array[*num_rows] != (tsk_size_t) data_len) {
                ret = tsk_trace_error(TSK_ERR_BAD_OFFSET);
                goto out;
            }
        }
    }
out:
    /* Non-NULL only if we bailed out while still holding the store's array. */
    tsk_safe_free(store_offset_array);
    return ret;
}
/* Stores the offset array of the ragged column ``col`` under the key
 * ``<name>_offset``.
 *
 * Offsets are written as 64 bit values when TSK_DUMP_FORCE_OFFSET_64 is set
 * in ``options`` or when the final offset does not fit in 32 bits; in that
 * case the store borrows the table's own array. Otherwise a temporary 32 bit
 * copy is written and released before returning.
 *
 * Returns 0 on success or a negative error code.
 */
static int
write_offset_col(
    kastore_t *store, const write_table_ragged_col_t *col, tsk_flags_t options)
{
    int ret = 0;
    char key[TSK_MAX_COL_NAME_LEN];
    const tsk_size_t num_offsets = col->num_rows + 1;
    const bool fits_in_32 = col->offset_array[col->num_rows] <= UINT32_MAX;
    const bool narrow = fits_in_32 && !(options & TSK_DUMP_FORCE_OFFSET_64);
    uint32_t *narrowed = NULL;
    /* Start from the 64 bit, borrowed configuration and override below. */
    const void *payload = col->offset_array;
    int kas_type = KAS_UINT64;
    int32_t kas_flags = KAS_BORROWS_ARRAY;
    tsk_size_t k;

    assert(strlen(col->name) + strlen("_offset") + 2 < sizeof(key));
    strcpy(key, col->name);
    strcat(key, "_offset");

    if (narrow) {
        narrowed = tsk_malloc(num_offsets * sizeof(*narrowed));
        if (narrowed == NULL) {
            ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
            goto out;
        }
        for (k = 0; k < num_offsets; k++) {
            narrowed[k] = (uint32_t) col->offset_array[k];
        }
        payload = narrowed;
        kas_type = KAS_UINT32;
        /* The temporary buffer is freed below, so the store must copy it. */
        kas_flags = 0;
    }
    ret = kastore_puts(store, key, payload, (size_t) num_offsets, kas_type, kas_flags);
    if (ret != 0) {
        ret = tsk_set_kas_error(ret);
    }
out:
    tsk_safe_free(narrowed);
    return ret;
}
col->data_type, KAS_BORROWS_ARRAY); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } ret = write_offset_col(store, col, options); if (ret != 0) { goto out; } } out: return ret; } static int write_table_cols(kastore_t *store, const write_table_col_t *write_cols, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; const write_table_col_t *col; for (col = write_cols; col->name != NULL; col++) { ret = kastore_puts(store, col->name, col->array, (size_t) col->len, col->type, KAS_BORROWS_ARRAY); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } } out: return ret; } static int write_table(kastore_t *store, const write_table_col_t *cols, const write_table_ragged_col_t *ragged_cols, tsk_flags_t options) { int ret = write_table_cols(store, cols, options); if (ret != 0) { goto out; } ret = write_table_ragged_cols(store, ragged_cols, options); if (ret != 0) { goto out; } out: return ret; } /* Checks that the specified list of offsets is well-formed. */ static int check_offsets( tsk_size_t num_rows, const tsk_size_t *offsets, tsk_size_t length, bool check_length) { int ret = 0; tsk_size_t j; if (offsets[0] != 0) { ret = tsk_trace_error(TSK_ERR_BAD_OFFSET); goto out; } if (check_length && offsets[num_rows] != length) { ret = tsk_trace_error(TSK_ERR_BAD_OFFSET); goto out; } for (j = 0; j < num_rows; j++) { if (offsets[j] > offsets[j + 1]) { ret = tsk_trace_error(TSK_ERR_BAD_OFFSET); goto out; } } ret = 0; out: return ret; } static int calculate_max_rows(tsk_size_t num_rows, tsk_size_t max_rows, tsk_size_t max_rows_increment, tsk_size_t additional_rows, tsk_size_t *ret_new_max_rows) { tsk_size_t new_max_rows; int ret = 0; if (check_table_overflow(num_rows, additional_rows)) { ret = tsk_trace_error(TSK_ERR_TABLE_OVERFLOW); goto out; } if (num_rows + additional_rows <= max_rows) { new_max_rows = max_rows; } else { if (max_rows_increment == 0) { /* Doubling by default */ new_max_rows = TSK_MIN(max_rows * 2, TSK_MAX_ID + (tsk_size_t) 1); /* Add some constraints to prevent 
very small allocations */ if (new_max_rows < 1024) { new_max_rows = 1024; } /* Prevent allocating more than ~2 million additional rows unless needed*/ if (new_max_rows - max_rows > 2097152) { new_max_rows = max_rows + 2097152; } } else { /* Use user increment value */ if (check_table_overflow(max_rows, max_rows_increment)) { ret = tsk_trace_error(TSK_ERR_TABLE_OVERFLOW); goto out; } new_max_rows = max_rows + max_rows_increment; } new_max_rows = TSK_MAX(new_max_rows, num_rows + additional_rows); } *ret_new_max_rows = new_max_rows; out: return ret; } static int calculate_max_length(tsk_size_t current_length, tsk_size_t max_length, tsk_size_t max_length_increment, tsk_size_t additional_length, tsk_size_t *ret_new_max_length) { tsk_size_t new_max_length; int ret = 0; if (check_offset_overflow(current_length, additional_length)) { ret = tsk_trace_error(TSK_ERR_COLUMN_OVERFLOW); goto out; } if (current_length + additional_length <= max_length) { new_max_length = max_length; } else { if (max_length_increment == 0) { /* Doubling by default */ new_max_length = TSK_MIN(max_length * 2, TSK_MAX_SIZE); /* Add some constraints to prevent very small allocations */ if (new_max_length < 65536) { new_max_length = 65536; } /* Prevent allocating more than 100MB additional unless needed*/ if (new_max_length - max_length > 104857600) { new_max_length = max_length + 104857600; } new_max_length = TSK_MAX(new_max_length, current_length + additional_length); } else { /* Use user increment value */ if (check_offset_overflow(max_length, max_length_increment)) { /* Here we could allocate to the maximum size. * Instead we are erroring out as this is much easier to test. * The cost is that (at most) the last "max_length_increment"-1 * bytes of the possible array space can't be used. 
*/ ret = tsk_trace_error(TSK_ERR_COLUMN_OVERFLOW); goto out; } new_max_length = max_length + max_length_increment; } new_max_length = TSK_MAX(new_max_length, current_length + additional_length); } *ret_new_max_length = new_max_length; out: return ret; } static int expand_column(void **column, tsk_size_t new_max_rows, size_t element_size) { int ret = 0; void *tmp; tmp = tsk_realloc((void **) *column, new_max_rows * element_size); if (tmp == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } *column = tmp; out: return ret; } static int expand_ragged_column(tsk_size_t current_length, tsk_size_t additional_length, tsk_size_t max_length_increment, tsk_size_t *max_length, void **column, size_t element_size) { int ret = 0; tsk_size_t new_max_length; ret = calculate_max_length(current_length, *max_length, max_length_increment, additional_length, &new_max_length); if (ret != 0) { goto out; } if (new_max_length > *max_length) { ret = expand_column(column, new_max_length, element_size); if (ret != 0) { goto out; } *max_length = new_max_length; } out: return ret; } /* TODO rename to copy_string or replace_and_copy_string */ static int replace_string( char **str, tsk_size_t *len, const char *new_str, const tsk_size_t new_len) { int ret = 0; tsk_safe_free(*str); *str = NULL; *len = new_len; if (new_len > 0) { *str = tsk_malloc(new_len * sizeof(char)); if (*str == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy(*str, new_str, new_len * sizeof(char)); } out: return ret; } static int takeset_string(char **str, tsk_size_t *len, char *new_str, const tsk_size_t new_len) { tsk_safe_free(*str); *str = new_str; *len = new_len; return 0; } static int alloc_empty_ragged_column(tsk_size_t num_rows, void **data_col, tsk_size_t **offset_col) { int ret = 0; *data_col = tsk_malloc(1); *offset_col = tsk_calloc(num_rows + 1, sizeof(tsk_size_t)); if (*data_col == NULL || *offset_col == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } out: return 
ret; } static int check_ragged_column(tsk_size_t num_rows, void *data, tsk_size_t *offset) { int ret = 0; if ((data == NULL) != (offset == NULL)) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if (data != NULL) { ret = check_offsets(num_rows, offset, 0, false); if (ret != 0) { goto out; } } out: return ret; } static int takeset_ragged_column(tsk_size_t num_rows, void *data, tsk_size_t *offset, void **data_dest, tsk_size_t **offset_dest, tsk_size_t *length_dest) { int ret = 0; if (data == NULL) { ret = alloc_empty_ragged_column(num_rows, (void *) data_dest, offset_dest); if (ret != 0) { goto out; } } else { *data_dest = data; *offset_dest = offset; } *length_dest = (*offset_dest)[num_rows]; out: return ret; } static int takeset_optional_id_column(tsk_size_t num_rows, tsk_id_t *input, tsk_id_t **dest) { int ret = 0; tsk_size_t buffsize; tsk_id_t *buff; if (input == NULL) { buffsize = num_rows * sizeof(*buff); buff = tsk_malloc(buffsize); if (buff == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } *dest = buff; tsk_memset(buff, 0xff, buffsize); } else { *dest = input; } out: return ret; } static int write_metadata_schema_header( FILE *out, const char *metadata_schema, tsk_size_t metadata_schema_length) { const char *fmt = "#metadata_schema#\n" "%.*s\n" "#end#metadata_schema\n" TABLE_SEP; return fprintf(out, fmt, (int) metadata_schema_length, metadata_schema); } /* Utilities for in-place subsetting columns */ static tsk_size_t count_true(tsk_size_t num_rows, const tsk_bool_t *restrict keep) { tsk_size_t j; tsk_size_t count = 0; for (j = 0; j < num_rows; j++) { if (keep[j]) { count++; } } return count; } static void keep_mask_to_id_map( tsk_size_t num_rows, const tsk_bool_t *restrict keep, tsk_id_t *restrict id_map) { tsk_size_t j; tsk_id_t next_id = 0; for (j = 0; j < num_rows; j++) { id_map[j] = TSK_NULL; if (keep[j]) { id_map[j] = next_id; next_id++; } } } static tsk_size_t subset_remap_id_column(tsk_id_t *restrict column, tsk_size_t 
num_rows, const tsk_bool_t *restrict keep, const tsk_id_t *restrict id_map) { tsk_size_t j, k; tsk_id_t value; k = 0; for (j = 0; j < num_rows; j++) { if (keep[j]) { value = column[j]; if (value != TSK_NULL) { value = id_map[value]; } column[k] = value; k++; } } return k; } /* Trigger warning: C++ programmers should look away... This may be one of the * few cases where some macro funkiness is warranted, as these are exact * duplicates of the same function with just the type of the column * parameter changed. */ static tsk_size_t subset_id_column( tsk_id_t *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) { tsk_size_t j, k; k = 0; for (j = 0; j < num_rows; j++) { if (keep[j]) { column[k] = column[j]; k++; } } return k; } static tsk_size_t subset_flags_column( tsk_flags_t *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) { tsk_size_t j, k; k = 0; for (j = 0; j < num_rows; j++) { if (keep[j]) { column[k] = column[j]; k++; } } return k; } static tsk_size_t subset_double_column( double *restrict column, tsk_size_t num_rows, const tsk_bool_t *restrict keep) { tsk_size_t j, k; k = 0; for (j = 0; j < num_rows; j++) { if (keep[j]) { column[k] = column[j]; k++; } } return k; } static tsk_size_t subset_ragged_char_column(char *restrict data, tsk_size_t *restrict offset_col, tsk_size_t num_rows, const tsk_bool_t *restrict keep) { tsk_size_t j, k, i, offset; k = 0; offset = 0; for (j = 0; j < num_rows; j++) { if (keep[j]) { offset_col[k] = offset; /* Note: Unclear whether it's worth calling memcpy instead here? 
* Need to be careful since the regions are overlapping */ for (i = offset_col[j]; i < offset_col[j + 1]; i++) { data[offset] = data[i]; offset++; } k++; } } offset_col[k] = offset; return offset; } static tsk_size_t subset_ragged_double_column(double *restrict data, tsk_size_t *restrict offset_col, tsk_size_t num_rows, const tsk_bool_t *restrict keep) { tsk_size_t j, k, i, offset; k = 0; offset = 0; for (j = 0; j < num_rows; j++) { if (keep[j]) { offset_col[k] = offset; /* Note: Unclear whether it's worth calling memcpy instead here? * Need to be careful since the regions are overlapping */ for (i = offset_col[j]; i < offset_col[j + 1]; i++) { data[offset] = data[i]; offset++; } k++; } } offset_col[k] = offset; return offset; } static tsk_size_t subset_remap_ragged_id_column(tsk_id_t *restrict data, tsk_size_t *restrict offset_col, tsk_size_t num_rows, const tsk_bool_t *restrict keep, const tsk_id_t *restrict id_map) { tsk_size_t j, k, i, offset; tsk_id_t di; k = 0; offset = 0; for (j = 0; j < num_rows; j++) { if (keep[j]) { offset_col[k] = offset; for (i = offset_col[j]; i < offset_col[j + 1]; i++) { di = data[i]; if (di != TSK_NULL) { di = id_map[di]; } data[offset] = di; offset++; } k++; } } offset_col[k] = offset; return offset; } /************************* * reference sequence *************************/ int tsk_reference_sequence_init( tsk_reference_sequence_t *self, tsk_flags_t TSK_UNUSED(options)) { tsk_memset(self, 0, sizeof(*self)); return 0; } int tsk_reference_sequence_free(tsk_reference_sequence_t *self) { tsk_safe_free(self->data); tsk_safe_free(self->url); tsk_safe_free(self->metadata); tsk_safe_free(self->metadata_schema); return 0; } bool tsk_reference_sequence_is_null(const tsk_reference_sequence_t *self) { return self->data_length == 0 && self->url_length == 0 && self->metadata_length == 0 && self->metadata_schema_length == 0; } bool tsk_reference_sequence_equals(const tsk_reference_sequence_t *self, const tsk_reference_sequence_t *other, 
/* Copies the reference sequence ``self`` into ``dest``.
 *
 * Unless TSK_NO_INIT is set in ``options``, ``dest`` is initialised first;
 * with TSK_NO_INIT the caller must pass an already initialised object, whose
 * previous contents are replaced.
 *
 * If ``self`` is null (all lengths zero), ``dest`` is put into the null
 * state as well; otherwise data, url, metadata and metadata schema are
 * copied in turn.
 *
 * Returns 0 on success or a negative error code.
 */
int
tsk_reference_sequence_copy(const tsk_reference_sequence_t *self,
    tsk_reference_sequence_t *dest, tsk_flags_t options)
{
    int ret = 0;

    if (!(options & TSK_NO_INIT)) {
        ret = tsk_reference_sequence_init(dest, 0);
        if (ret != 0) {
            goto out;
        }
    }

    if (tsk_reference_sequence_is_null(self)) {
        /* Freeing alone releases the buffers but leaves the length fields
         * untouched, so a previously populated dest would still not test as
         * null. Re-initialising zeroes the whole struct. */
        tsk_reference_sequence_free(dest);
        ret = tsk_reference_sequence_init(dest, 0);
        if (ret != 0) {
            goto out;
        }
    } else {
        ret = tsk_reference_sequence_set_data(dest, self->data, self->data_length);
        if (ret != 0) {
            goto out;
        }
        ret = tsk_reference_sequence_set_url(dest, self->url, self->url_length);
        if (ret != 0) {
            goto out;
        }
        ret = tsk_reference_sequence_set_metadata(
            dest, self->metadata, self->metadata_length);
        if (ret != 0) {
            goto out;
        }
        ret = tsk_reference_sequence_set_metadata_schema(
            dest, self->metadata_schema, self->metadata_schema_length);
        if (ret != 0) {
            goto out;
        }
    }
out:
    return ret;
}
*metadata, tsk_size_t metadata_length) { return replace_string( &self->metadata, &self->metadata_length, metadata, metadata_length); } int tsk_reference_sequence_set_metadata_schema(tsk_reference_sequence_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, &self->metadata_schema_length, metadata_schema, metadata_schema_length); } int tsk_reference_sequence_takeset_data( tsk_reference_sequence_t *self, char *data, tsk_size_t data_length) { return takeset_string(&self->data, &self->data_length, data, data_length); } int tsk_reference_sequence_takeset_metadata( tsk_reference_sequence_t *self, char *metadata, tsk_size_t metadata_length) { return takeset_string( &self->metadata, &self->metadata_length, metadata, metadata_length); } /************************* * individual table *************************/ static void tsk_individual_table_free_columns(tsk_individual_table_t *self) { tsk_safe_free(self->flags); tsk_safe_free(self->location); tsk_safe_free(self->location_offset); tsk_safe_free(self->parents); tsk_safe_free(self->parents_offset); tsk_safe_free(self->metadata); tsk_safe_free(self->metadata_offset); } int tsk_individual_table_free(tsk_individual_table_t *self) { tsk_individual_table_free_columns(self); tsk_safe_free(self->metadata_schema); return 0; } static int tsk_individual_table_expand_main_columns( tsk_individual_table_t *self, tsk_size_t additional_rows) { int ret = 0; tsk_size_t new_max_rows; ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, additional_rows, &new_max_rows); if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { ret = expand_column((void **) &self->flags, new_max_rows, sizeof(tsk_flags_t)); if (ret != 0) { goto out; } ret = expand_column( (void **) &self->location_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( (void **) &self->parents_offset, new_max_rows + 
1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } self->max_rows = new_max_rows; } out: return ret; } static int tsk_individual_table_expand_location( tsk_individual_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->location_length, additional_length, self->max_location_length_increment, &self->max_location_length, (void **) &self->location, sizeof(*self->location)); } static int tsk_individual_table_expand_parents( tsk_individual_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->parents_length, additional_length, self->max_parents_length_increment, &self->max_parents_length, (void **) &self->parents, sizeof(*self->parents)); } static int tsk_individual_table_expand_metadata( tsk_individual_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->metadata_length, additional_length, self->max_metadata_length_increment, &self->max_metadata_length, (void **) &self->metadata, sizeof(*self->metadata)); } int tsk_individual_table_set_max_rows_increment( tsk_individual_table_t *self, tsk_size_t max_rows_increment) { self->max_rows_increment = max_rows_increment; return 0; } int tsk_individual_table_set_max_metadata_length_increment( tsk_individual_table_t *self, tsk_size_t max_metadata_length_increment) { self->max_metadata_length_increment = (tsk_size_t) max_metadata_length_increment; return 0; } int tsk_individual_table_set_max_location_length_increment( tsk_individual_table_t *self, tsk_size_t max_location_length_increment) { self->max_location_length_increment = (tsk_size_t) max_location_length_increment; return 0; } int tsk_individual_table_set_max_parents_length_increment( tsk_individual_table_t *self, tsk_size_t max_parents_length_increment) { self->max_parents_length_increment = (tsk_size_t) max_parents_length_increment; return 0; } int 
/* Copies the individual table ``self`` into ``dest``.
 *
 * ``dest`` is initialised first unless TSK_NO_INIT is set in ``options``.
 * All columns are then copied, followed by the metadata schema.
 *
 * Returns 0 on success or a negative error code from the first step that
 * fails.
 */
int TSK_WARN_UNUSED
tsk_individual_table_copy(const tsk_individual_table_t *self,
    tsk_individual_table_t *dest, tsk_flags_t options)
{
    int ret = 0;
    const bool init_dest = (options & TSK_NO_INIT) == 0;

    if (init_dest) {
        ret = tsk_individual_table_init(dest, 0);
    }
    if (ret == 0) {
        ret = tsk_individual_table_set_columns(dest, self->num_rows, self->flags,
            self->location, self->location_offset, self->parents, self->parents_offset,
            self->metadata, self->metadata_offset);
    }
    if (ret == 0) {
        ret = tsk_individual_table_set_metadata_schema(
            dest, self->metadata_schema, self->metadata_schema_length);
    }
    return ret;
}
tsk_individual_table_clear(self); if (ret != 0) { goto out; } ret = tsk_individual_table_append_columns(self, num_rows, flags, location, location_offset, parents, parents_offset, metadata, metadata_offset); out: return ret; } int TSK_WARN_UNUSED tsk_individual_table_takeset_columns(tsk_individual_table_t *self, tsk_size_t num_rows, tsk_flags_t *flags, double *location, tsk_size_t *location_offset, tsk_id_t *parents, tsk_size_t *parents_offset, char *metadata, tsk_size_t *metadata_offset) { int ret = 0; /* We need to check all the inputs before we start freeing or taking memory */ ret = check_ragged_column(num_rows, location, location_offset); if (ret != 0) { goto out; } ret = check_ragged_column(num_rows, parents, parents_offset); if (ret != 0) { goto out; } ret = check_ragged_column(num_rows, metadata, metadata_offset); if (ret != 0) { goto out; } tsk_individual_table_free_columns(self); self->num_rows = num_rows; self->max_rows = num_rows; if (flags == NULL) { /* Flags defaults to all zeros if not specified. The column is often * unused so this is a worthwhile optimisation. 
*/ self->flags = tsk_calloc(num_rows, sizeof(*self->flags)); if (self->flags == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } } else { self->flags = flags; } ret = takeset_ragged_column(num_rows, location, location_offset, (void *) &self->location, &self->location_offset, &self->location_length); if (ret != 0) { goto out; } ret = takeset_ragged_column(num_rows, parents, parents_offset, (void *) &self->parents, &self->parents_offset, &self->parents_length); if (ret != 0) { goto out; } ret = takeset_ragged_column(num_rows, metadata, metadata_offset, (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); if (ret != 0) { goto out; } out: return ret; } int tsk_individual_table_append_columns(tsk_individual_table_t *self, tsk_size_t num_rows, const tsk_flags_t *flags, const double *location, const tsk_size_t *location_offset, const tsk_id_t *parents, const tsk_size_t *parents_offset, const char *metadata, const tsk_size_t *metadata_offset) { int ret; tsk_size_t j, metadata_length, location_length, parents_length; if (flags == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if ((location == NULL) != (location_offset == NULL)) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if ((parents == NULL) != (parents_offset == NULL)) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if ((metadata == NULL) != (metadata_offset == NULL)) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = tsk_individual_table_expand_main_columns(self, (tsk_size_t) num_rows); if (ret != 0) { goto out; } tsk_memcpy(self->flags + self->num_rows, flags, num_rows * sizeof(tsk_flags_t)); if (location == NULL) { for (j = 0; j < num_rows; j++) { self->location_offset[self->num_rows + j + 1] = (tsk_size_t) self->location_length; } } else { ret = check_offsets(num_rows, location_offset, 0, false); if (ret != 0) { goto out; } for (j = 0; j < num_rows; j++) { self->location_offset[self->num_rows + j] = (tsk_size_t) 
self->location_length + location_offset[j];
        }
        /* The last input offset is the total number of location values. */
        location_length = location_offset[num_rows];
        ret = tsk_individual_table_expand_location(self, location_length);
        if (ret != 0) {
            goto out;
        }
        tsk_memcpy(self->location + self->location_length, location,
            location_length * sizeof(double));
        self->location_length += location_length;
    }
    if (parents == NULL) {
        /* No parents supplied: each appended row is empty, so every new end
         * offset equals the current parents_length. */
        for (j = 0; j < num_rows; j++) {
            self->parents_offset[self->num_rows + j + 1]
                = (tsk_size_t) self->parents_length;
        }
    } else {
        ret = check_offsets(num_rows, parents_offset, 0, false);
        if (ret != 0) {
            goto out;
        }
        /* Shift the caller's offsets by the data already in the column. */
        for (j = 0; j < num_rows; j++) {
            self->parents_offset[self->num_rows + j]
                = (tsk_size_t) self->parents_length + parents_offset[j];
        }
        parents_length = parents_offset[num_rows];
        ret = tsk_individual_table_expand_parents(self, parents_length);
        if (ret != 0) {
            goto out;
        }
        tsk_memcpy(self->parents + self->parents_length, parents,
            parents_length * sizeof(tsk_id_t));
        self->parents_length += parents_length;
    }
    if (metadata == NULL) {
        /* No metadata supplied: each appended row is empty. */
        for (j = 0; j < num_rows; j++) {
            self->metadata_offset[self->num_rows + j + 1]
                = (tsk_size_t) self->metadata_length;
        }
    } else {
        ret = check_offsets(num_rows, metadata_offset, 0, false);
        if (ret != 0) {
            goto out;
        }
        /* Shift the caller's offsets by the data already in the column. */
        for (j = 0; j < num_rows; j++) {
            self->metadata_offset[self->num_rows + j]
                = (tsk_size_t) self->metadata_length + metadata_offset[j];
        }
        metadata_length = metadata_offset[num_rows];
        ret = tsk_individual_table_expand_metadata(self, metadata_length);
        if (ret != 0) {
            goto out;
        }
        tsk_memcpy(self->metadata + self->metadata_length, metadata,
            metadata_length * sizeof(char));
        self->metadata_length += metadata_length;
    }
    /* Commit the new row count and write the terminating offset of each
     * ragged column. */
    self->num_rows += (tsk_size_t) num_rows;
    self->location_offset[self->num_rows] = self->location_length;
    self->parents_offset[self->num_rows] = self->parents_length;
    self->metadata_offset[self->num_rows] = self->metadata_length;
out:
    return ret;
}

/* Write one row into storage that the caller has already expanded; the
 * capacity requirements are checked with tsk_bug_assert. Returns the ID
 * (index) of the new row. */
static tsk_id_t
tsk_individual_table_add_row_internal(tsk_individual_table_t *self, tsk_flags_t flags,
    const double *location,
tsk_size_t location_length, const tsk_id_t *parents, const tsk_size_t parents_length, const char *metadata, tsk_size_t metadata_length) { tsk_bug_assert(self->num_rows < self->max_rows); tsk_bug_assert(self->parents_length + parents_length <= self->max_parents_length); tsk_bug_assert(self->metadata_length + metadata_length <= self->max_metadata_length); tsk_bug_assert(self->location_length + location_length <= self->max_location_length); self->flags[self->num_rows] = flags; tsk_memmove(self->location + self->location_length, location, location_length * sizeof(*self->location)); self->location_offset[self->num_rows + 1] = self->location_length + location_length; self->location_length += location_length; tsk_memmove(self->parents + self->parents_length, parents, parents_length * sizeof(*self->parents)); self->parents_offset[self->num_rows + 1] = self->parents_length + parents_length; self->parents_length += parents_length; tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length * sizeof(*self->metadata)); self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; self->metadata_length += metadata_length; self->num_rows++; return (tsk_id_t) self->num_rows - 1; } tsk_id_t tsk_individual_table_add_row(tsk_individual_table_t *self, tsk_flags_t flags, const double *location, tsk_size_t location_length, const tsk_id_t *parents, tsk_size_t parents_length, const char *metadata, tsk_size_t metadata_length) { tsk_id_t ret = 0; ret = tsk_individual_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_individual_table_expand_location(self, location_length); if (ret != 0) { goto out; } ret = tsk_individual_table_expand_parents(self, parents_length); if (ret != 0) { goto out; } ret = tsk_individual_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } ret = tsk_individual_table_add_row_internal(self, flags, location, location_length, parents, parents_length, metadata, metadata_length); out: 
return ret; } static int tsk_individual_table_update_row_rewrite(tsk_individual_table_t *self, tsk_id_t index, tsk_flags_t flags, const double *location, tsk_size_t location_length, const tsk_id_t *parents, tsk_size_t parents_length, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_id_t j, ret_id; tsk_individual_table_t copy; tsk_size_t num_rows; tsk_id_t *rows = NULL; ret = tsk_individual_table_copy(self, ©, 0); if (ret != 0) { goto out; } rows = tsk_malloc(self->num_rows * sizeof(*rows)); if (rows == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_individual_table_truncate(self, (tsk_size_t) index); tsk_bug_assert(ret == 0); ret_id = tsk_individual_table_add_row(self, flags, location, location_length, parents, parents_length, metadata, metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_rows = 0; for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { rows[num_rows] = j; num_rows++; } ret = tsk_individual_table_extend(self, ©, num_rows, rows, 0); if (ret != 0) { goto out; } out: tsk_individual_table_free(©); tsk_safe_free(rows); return ret; } int tsk_individual_table_update_row(tsk_individual_table_t *self, tsk_id_t index, tsk_flags_t flags, const double *location, tsk_size_t location_length, const tsk_id_t *parents, tsk_size_t parents_length, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_individual_t current_row; ret = tsk_individual_table_get_row(self, index, ¤t_row); if (ret != 0) { goto out; } if (current_row.location_length == location_length && current_row.parents_length == parents_length && current_row.metadata_length == metadata_length) { self->flags[index] = flags; /* Note: important to use tsk_memmove here as we may be provided pointers * to the column memory as input via get_row */ tsk_memmove(&self->location[self->location_offset[index]], location, location_length * sizeof(*location)); tsk_memmove(&self->parents[self->parents_offset[index]], parents, parents_length 
* sizeof(*parents)); tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, metadata_length * sizeof(*metadata)); } else { ret = tsk_individual_table_update_row_rewrite(self, index, flags, location, location_length, parents, parents_length, metadata, metadata_length); if (ret != 0) { goto out; } } out: return ret; } int tsk_individual_table_clear(tsk_individual_table_t *self) { return tsk_individual_table_truncate(self, 0); } int tsk_individual_table_truncate(tsk_individual_table_t *self, tsk_size_t num_rows) { int ret = 0; if (num_rows > self->num_rows) { ret = tsk_trace_error(TSK_ERR_BAD_TABLE_POSITION); goto out; } self->num_rows = num_rows; self->location_length = self->location_offset[num_rows]; self->parents_length = self->parents_offset[num_rows]; self->metadata_length = self->metadata_offset[num_rows]; out: return ret; } int tsk_individual_table_extend(tsk_individual_table_t *self, const tsk_individual_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_individual_t individual; if (self == other) { ret = tsk_trace_error(TSK_ERR_CANNOT_EXTEND_FROM_SELF); goto out; } /* We know how much to expand the non-ragged columns, so do it ahead of time */ ret = tsk_individual_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } for (j = 0; j < num_rows; j++) { ret = tsk_individual_table_get_row( other, row_indexes == NULL ? 
(tsk_id_t) j : row_indexes[j], &individual); if (ret != 0) { goto out; } ret_id = tsk_individual_table_add_row(self, individual.flags, individual.location, individual.location_length, individual.parents, individual.parents_length, individual.metadata, individual.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; out: return ret; } void tsk_individual_table_print_state(const tsk_individual_table_t *self, FILE *out) { tsk_size_t j, k; fprintf(out, "\n" TABLE_SEP); fprintf(out, "tsk_individual_tbl: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, (long long) self->max_rows_increment); fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->metadata_length, (long long) self->max_metadata_length, (long long) self->max_metadata_length_increment); fprintf(out, TABLE_SEP); /* We duplicate the dump_text code here because we want to output * the offset columns. 
*/ write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); fprintf(out, "id\tflags\tlocation_offset\tlocation\t"); fprintf(out, "parents_offset\tparents\t"); fprintf(out, "metadata_offset\tmetadata\n"); for (j = 0; j < self->num_rows; j++) { fprintf(out, "%lld\t%lld\t", (long long) j, (long long) self->flags[j]); fprintf(out, "%lld\t", (long long) self->location_offset[j]); for (k = self->location_offset[j]; k < self->location_offset[j + 1]; k++) { fprintf(out, "%f", self->location[k]); if (k + 1 < self->location_offset[j + 1]) { fprintf(out, ","); } } fprintf(out, "\t"); fprintf(out, "%lld\t", (long long) self->parents_offset[j]); for (k = self->parents_offset[j]; k < self->parents_offset[j + 1]; k++) { fprintf(out, "%lld", (long long) self->parents[k]); if (k + 1 < self->parents_offset[j + 1]) { fprintf(out, ","); } } fprintf(out, "\t"); fprintf(out, "%lld\t", (long long) self->metadata_offset[j]); for (k = self->metadata_offset[j]; k < self->metadata_offset[j + 1]; k++) { fprintf(out, "%c", self->metadata[k]); } fprintf(out, "\n"); } } static inline void tsk_individual_table_get_row_unsafe( const tsk_individual_table_t *self, tsk_id_t index, tsk_individual_t *row) { row->id = (tsk_id_t) index; row->flags = self->flags[index]; row->location_length = self->location_offset[index + 1] - self->location_offset[index]; row->location = self->location + self->location_offset[index]; row->parents_length = self->parents_offset[index + 1] - self->parents_offset[index]; row->parents = self->parents + self->parents_offset[index]; row->metadata_length = self->metadata_offset[index + 1] - self->metadata_offset[index]; row->metadata = self->metadata + self->metadata_offset[index]; /* Also have referencing individuals here. Should this be a different struct? * See also site. 
*/ row->nodes_length = 0; row->nodes = NULL; } int tsk_individual_table_get_row( const tsk_individual_table_t *self, tsk_id_t index, tsk_individual_t *row) { int ret = 0; if (index < 0 || index >= (tsk_id_t) self->num_rows) { ret = tsk_trace_error(TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); goto out; } tsk_individual_table_get_row_unsafe(self, index, row); out: return ret; } int tsk_individual_table_set_metadata_schema(tsk_individual_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, &self->metadata_schema_length, metadata_schema, metadata_schema_length); } int tsk_individual_table_dump_text(const tsk_individual_table_t *self, FILE *out) { int ret = TSK_ERR_IO; tsk_size_t j, k; tsk_size_t metadata_len; int err; err = write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); if (err < 0) { goto out; } err = fprintf(out, "id\tflags\tlocation\tparents\tmetadata\n"); if (err < 0) { goto out; } for (j = 0; j < self->num_rows; j++) { metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; err = fprintf(out, "%lld\t%lld\t", (long long) j, (long long) self->flags[j]); if (err < 0) { goto out; } for (k = self->location_offset[j]; k < self->location_offset[j + 1]; k++) { err = fprintf(out, "%.*g", TSK_DBL_DECIMAL_DIG, self->location[k]); if (err < 0) { goto out; } if (k + 1 < self->location_offset[j + 1]) { err = fprintf(out, ","); if (err < 0) { goto out; } } } err = fprintf(out, "\t"); if (err < 0) { goto out; } for (k = self->parents_offset[j]; k < self->parents_offset[j + 1]; k++) { err = fprintf(out, "%lld", (long long) self->parents[k]); if (err < 0) { goto out; } if (k + 1 < self->parents_offset[j + 1]) { err = fprintf(out, ","); if (err < 0) { goto out; } } } err = fprintf(out, "\t%.*s\n", (int) metadata_len, self->metadata + self->metadata_offset[j]); if (err < 0) { goto out; } } ret = 0; out: return ret; } bool tsk_individual_table_equals(const 
tsk_individual_table_t *self, const tsk_individual_table_t *other, tsk_flags_t options) { bool ret = self->num_rows == other->num_rows && tsk_memcmp(self->flags, other->flags, self->num_rows * sizeof(tsk_flags_t)) == 0 && tsk_memcmp(self->location_offset, other->location_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp( self->location, other->location, self->location_length * sizeof(double)) == 0 && tsk_memcmp(self->parents_offset, other->parents_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp( self->parents, other->parents, self->parents_length * sizeof(tsk_id_t)) == 0; if (!(options & TSK_CMP_IGNORE_METADATA)) { ret = ret && self->metadata_length == other->metadata_length && self->metadata_schema_length == other->metadata_schema_length && tsk_memcmp(self->metadata_offset, other->metadata_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->metadata, other->metadata, self->metadata_length * sizeof(char)) == 0 && tsk_memcmp(self->metadata_schema, other->metadata_schema, self->metadata_schema_length * sizeof(char)) == 0; } return ret; } int tsk_individual_table_keep_rows(tsk_individual_table_t *self, const tsk_bool_t *keep, tsk_flags_t TSK_UNUSED(options), tsk_id_t *ret_id_map) { int ret = 0; const tsk_size_t current_num_rows = self->num_rows; tsk_size_t j, k, remaining_rows; tsk_id_t pk; tsk_id_t *id_map = ret_id_map; tsk_id_t *restrict parents = self->parents; tsk_size_t *restrict parents_offset = self->parents_offset; if (ret_id_map == NULL) { id_map = tsk_malloc(current_num_rows * sizeof(*id_map)); if (id_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } } keep_mask_to_id_map(current_num_rows, keep, id_map); /* See notes in tsk_mutation_table_keep_rows for possibilities * on making this more flexible */ for (j = 0; j < current_num_rows; j++) { if (keep[j]) { for (k = parents_offset[j]; k < parents_offset[j + 1]; k++) { pk = parents[k]; if (pk != TSK_NULL) { if (pk < 0 || pk >= 
(tsk_id_t) current_num_rows) {
                        /* A kept row refers to a parent ID outside the table. */
                        ret = tsk_trace_error(TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS);
                        ;
                        goto out;
                    }
                    if (id_map[pk] == TSK_NULL) {
                        /* A kept row refers to a parent that is being removed. */
                        ret = tsk_trace_error(TSK_ERR_KEEP_ROWS_MAP_TO_DELETED);
                        goto out;
                    }
                }
            }
        }
    }

    /* All kept parent references were checked above; now subset each column
     * in place. */
    remaining_rows = subset_flags_column(self->flags, current_num_rows, keep);
    self->parents_length = subset_remap_ragged_id_column(
        self->parents, self->parents_offset, current_num_rows, keep, id_map);
    self->location_length = subset_ragged_double_column(
        self->location, self->location_offset, current_num_rows, keep);
    if (self->metadata_length > 0) {
        /* Implementation note: we special case metadata here because
         * it'll make the common-case of no metadata a bit faster, and
         * to also potentially support more general use of the
         * TSK_TABLE_NO_METADATA option. This is done for all the tables
         * but only commented on here. */
        self->metadata_length = subset_ragged_char_column(
            self->metadata, self->metadata_offset, current_num_rows, keep);
    }
    self->num_rows = remaining_rows;
out:
    /* Free the map only when it was allocated here rather than supplied by
     * the caller. */
    if (ret_id_map == NULL) {
        tsk_safe_free(id_map);
    }
    return ret;
}

/* Describe the table's columns for write_table and return its status. The
 * fixed-width entries are the flags column and the metadata schema; location,
 * parents and metadata are described as ragged columns with their offset
 * arrays. Each list ends with an entry whose name is NULL. */
static int
tsk_individual_table_dump(
    const tsk_individual_table_t *self, kastore_t *store, tsk_flags_t options)
{
    const write_table_col_t write_cols[] = {
        { "individuals/flags", (void *) self->flags, self->num_rows,
            TSK_FLAGS_STORAGE_TYPE },
        { "individuals/metadata_schema", (void *) self->metadata_schema,
            self->metadata_schema_length, KAS_UINT8 },
        { .name = NULL },
    };
    const write_table_ragged_col_t ragged_cols[] = {
        { "individuals/location", (void *) self->location, self->location_length,
            KAS_FLOAT64, self->location_offset, self->num_rows },
        { "individuals/parents", (void *) self->parents, self->parents_length,
            TSK_ID_STORAGE_TYPE, self->parents_offset, self->num_rows },
        { "individuals/metadata", (void *) self->metadata, self->metadata_length,
            KAS_UINT8, self->metadata_offset, self->num_rows },
        { .name = NULL },
    };

    return write_table(store, write_cols, ragged_cols, options);
}

static int
tsk_individual_table_load(tsk_individual_table_t *self, kastore_t *store)
{
    /* Reads the individual columns from `store` with read_table, applies the
     * metadata schema if one was read, and passes the column arrays to
     * tsk_individual_table_takeset_columns. Returns 0 on success or the first
     * non-zero status from a callee. */
    int ret = 0;
    tsk_flags_t *flags = NULL;
    double *location = NULL;
    tsk_size_t *location_offset = NULL;
    tsk_id_t *parents = NULL;
    tsk_size_t *parents_offset = NULL;
    char *metadata = NULL;
    tsk_size_t *metadata_offset = NULL;
    char *metadata_schema = NULL;
    tsk_size_t num_rows, location_length, parents_length, metadata_length,
        metadata_schema_length;

    /* Column descriptors; parents and the metadata schema are marked
     * TSK_COL_OPTIONAL. Each list ends with a NULL name. */
    read_table_col_t cols[] = {
        { "individuals/flags", (void **) &flags, TSK_FLAGS_STORAGE_TYPE, 0 },
        { .name = NULL },
    };
    read_table_ragged_col_t ragged_cols[] = {
        { "individuals/location", (void **) &location, &location_length, KAS_FLOAT64,
            &location_offset, 0 },
        { "individuals/parents", (void **) &parents, &parents_length,
            TSK_ID_STORAGE_TYPE, &parents_offset, TSK_COL_OPTIONAL },
        { "individuals/metadata", (void **) &metadata, &metadata_length, KAS_UINT8,
            &metadata_offset, 0 },
        { .name = NULL },
    };
    read_table_property_t properties[] = {
        { "individuals/metadata_schema", (void **) &metadata_schema,
            &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL },
        { .name = NULL },
    };

    ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0);
    if (ret != 0) {
        goto out;
    }
    if (metadata_schema != NULL) {
        ret = tsk_individual_table_set_metadata_schema(
            self, metadata_schema, metadata_schema_length);
        if (ret != 0) {
            goto out;
        }
    }
    ret = tsk_individual_table_takeset_columns(self, num_rows, flags, location,
        location_offset, parents, parents_offset, metadata, metadata_offset);
    if (ret != 0) {
        goto out;
    }
    /* The table now holds these arrays; clear the locals before the cleanup
     * call below (presumably so that free_read_table_mem does not release
     * them - confirm against that helper). */
    flags = NULL;
    location = NULL;
    location_offset = NULL;
    parents = NULL;
    parents_offset = NULL;
    metadata = NULL;
    metadata_offset = NULL;

out:
    free_read_table_mem(cols, ragged_cols, properties);
    return ret;
}

/*************************
 * node table
 *************************/

/* Free every column array of the node table. The metadata schema is not
 * touched here. */
static void
tsk_node_table_free_columns(tsk_node_table_t *self)
{
    tsk_safe_free(self->flags);
    tsk_safe_free(self->time);
    tsk_safe_free(self->population);
    tsk_safe_free(self->individual);
tsk_safe_free(self->metadata); tsk_safe_free(self->metadata_offset); } int tsk_node_table_free(tsk_node_table_t *self) { tsk_node_table_free_columns(self); tsk_safe_free(self->metadata_schema); return 0; } static int tsk_node_table_expand_main_columns(tsk_node_table_t *self, tsk_size_t additional_rows) { int ret = 0; tsk_size_t new_max_rows; ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, additional_rows, &new_max_rows); if (ret != 0) { goto out; } if (new_max_rows > self->max_rows) { ret = expand_column((void **) &self->flags, new_max_rows, sizeof(tsk_flags_t)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->time, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->population, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->individual, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } ret = expand_column( (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } self->max_rows = new_max_rows; } out: return ret; } static int tsk_node_table_expand_metadata(tsk_node_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->metadata_length, additional_length, self->max_metadata_length_increment, &self->max_metadata_length, (void **) &self->metadata, sizeof(*self->metadata)); } int tsk_node_table_set_max_rows_increment( tsk_node_table_t *self, tsk_size_t max_rows_increment) { self->max_rows_increment = max_rows_increment; return 0; } int tsk_node_table_set_max_metadata_length_increment( tsk_node_table_t *self, tsk_size_t max_metadata_length_increment) { self->max_metadata_length_increment = max_metadata_length_increment; return 0; } int tsk_node_table_init(tsk_node_table_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_memset(self, 0, sizeof(tsk_node_table_t)); /* Allocate space for one row initially, ensuring we always have valid 
pointers * even if the table is empty */ self->max_rows_increment = 1; self->max_metadata_length_increment = 1; ret = tsk_node_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_node_table_expand_metadata(self, 1); if (ret != 0) { goto out; } self->metadata_offset[0] = 0; self->max_rows_increment = 0; self->max_metadata_length_increment = 0; tsk_node_table_set_metadata_schema(self, NULL, 0); out: return ret; } int TSK_WARN_UNUSED tsk_node_table_copy( const tsk_node_table_t *self, tsk_node_table_t *dest, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_node_table_init(dest, 0); if (ret != 0) { goto out; } } ret = tsk_node_table_set_columns(dest, self->num_rows, self->flags, self->time, self->population, self->individual, self->metadata, self->metadata_offset); if (ret != 0) { goto out; } ret = tsk_node_table_set_metadata_schema( dest, self->metadata_schema, self->metadata_schema_length); out: return ret; } int TSK_WARN_UNUSED tsk_node_table_set_columns(tsk_node_table_t *self, tsk_size_t num_rows, const tsk_flags_t *flags, const double *time, const tsk_id_t *population, const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_offset) { int ret; ret = tsk_node_table_clear(self); if (ret != 0) { goto out; } ret = tsk_node_table_append_columns( self, num_rows, flags, time, population, individual, metadata, metadata_offset); out: return ret; } int TSK_WARN_UNUSED tsk_node_table_takeset_columns(tsk_node_table_t *self, tsk_size_t num_rows, tsk_flags_t *flags, double *time, tsk_id_t *population, tsk_id_t *individual, char *metadata, tsk_size_t *metadata_offset) { int ret = 0; /* We need to check all the inputs before we start freeing or taking memory */ if (flags == NULL || time == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = check_ragged_column(num_rows, metadata, metadata_offset); if (ret != 0) { goto out; } tsk_node_table_free_columns(self); self->num_rows = num_rows; 
    self->max_rows = num_rows;
    /* flags and time were verified non-NULL above and are stored directly. */
    self->flags = flags;
    self->time = time;

    ret = takeset_optional_id_column(num_rows, population, &self->population);
    if (ret != 0) {
        goto out;
    }
    ret = takeset_optional_id_column(num_rows, individual, &self->individual);
    if (ret != 0) {
        goto out;
    }
    ret = takeset_ragged_column(num_rows, metadata, metadata_offset,
        (void *) &self->metadata, &self->metadata_offset, &self->metadata_length);
    if (ret != 0) {
        goto out;
    }
out:
    return ret;
}

/* Append num_rows rows, copied from the given column arrays, to the end of
 * the table. flags and time are required. metadata and metadata_offset must
 * be both given or both NULL. A NULL population or individual column is
 * filled with -1 for the new rows. */
int
tsk_node_table_append_columns(tsk_node_table_t *self, tsk_size_t num_rows,
    const tsk_flags_t *flags, const double *time, const tsk_id_t *population,
    const tsk_id_t *individual, const char *metadata,
    const tsk_size_t *metadata_offset)
{
    int ret;
    tsk_size_t j, metadata_length;

    if (flags == NULL || time == NULL) {
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    if ((metadata == NULL) != (metadata_offset == NULL)) {
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    ret = tsk_node_table_expand_main_columns(self, num_rows);
    if (ret != 0) {
        goto out;
    }
    tsk_memcpy(self->time + self->num_rows, time, num_rows * sizeof(double));
    tsk_memcpy(self->flags + self->num_rows, flags, num_rows * sizeof(tsk_flags_t));
    if (metadata == NULL) {
        /* No metadata supplied: each appended row is empty, so every new end
         * offset equals the current metadata_length. */
        for (j = 0; j < num_rows; j++) {
            self->metadata_offset[self->num_rows + j + 1] = self->metadata_length;
        }
    } else {
        ret = check_offsets(num_rows, metadata_offset, 0, false);
        if (ret != 0) {
            goto out;
        }
        /* Shift the caller's offsets by the data already in the column. */
        for (j = 0; j < num_rows; j++) {
            self->metadata_offset[self->num_rows + j]
                = (tsk_size_t) self->metadata_length + metadata_offset[j];
        }
        /* The last input offset is the total number of metadata bytes. */
        metadata_length = metadata_offset[num_rows];
        ret = tsk_node_table_expand_metadata(self, metadata_length);
        if (ret != 0) {
            goto out;
        }
        tsk_memcpy(self->metadata + self->metadata_length, metadata,
            metadata_length * sizeof(char));
        self->metadata_length += metadata_length;
    }
    if (population == NULL) {
        /* Set population to NULL_POPULATION (-1) if not specified */
        tsk_memset(self->population + self->num_rows, 0xff,
            num_rows * sizeof(tsk_id_t));
    }
else { tsk_memcpy( self->population + self->num_rows, population, num_rows * sizeof(tsk_id_t)); } if (individual == NULL) { /* Set individual to NULL_INDIVIDUAL (-1) if not specified */ tsk_memset(self->individual + self->num_rows, 0xff, num_rows * sizeof(tsk_id_t)); } else { tsk_memcpy( self->individual + self->num_rows, individual, num_rows * sizeof(tsk_id_t)); } self->num_rows += (tsk_size_t) num_rows; self->metadata_offset[self->num_rows] = self->metadata_length; out: return ret; } static tsk_id_t tsk_node_table_add_row_internal(tsk_node_table_t *self, tsk_flags_t flags, double time, tsk_id_t population, tsk_id_t individual, const char *metadata, tsk_size_t metadata_length) { tsk_bug_assert(self->num_rows < self->max_rows); tsk_bug_assert(self->metadata_length + metadata_length <= self->max_metadata_length); tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length); self->flags[self->num_rows] = flags; self->time[self->num_rows] = time; self->population[self->num_rows] = population; self->individual[self->num_rows] = individual; self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; self->metadata_length += metadata_length; self->num_rows++; return (tsk_id_t) self->num_rows - 1; } tsk_id_t tsk_node_table_add_row(tsk_node_table_t *self, tsk_flags_t flags, double time, tsk_id_t population, tsk_id_t individual, const char *metadata, tsk_size_t metadata_length) { tsk_id_t ret = 0; ret = tsk_node_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_node_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } ret = tsk_node_table_add_row_internal( self, flags, time, population, individual, metadata, metadata_length); out: return ret; } static int tsk_node_table_update_row_rewrite(tsk_node_table_t *self, tsk_id_t index, tsk_flags_t flags, double time, tsk_id_t population, tsk_id_t individual, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_id_t j, ret_id; 
tsk_node_table_t copy; tsk_size_t num_rows; tsk_id_t *rows = NULL; ret = tsk_node_table_copy(self, ©, 0); if (ret != 0) { goto out; } rows = tsk_malloc(self->num_rows * sizeof(*rows)); if (rows == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_node_table_truncate(self, (tsk_size_t) index); tsk_bug_assert(ret == 0); ret_id = tsk_node_table_add_row( self, flags, time, population, individual, metadata, metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_rows = 0; for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { rows[num_rows] = j; num_rows++; } ret = tsk_node_table_extend(self, ©, num_rows, rows, 0); if (ret != 0) { goto out; } out: tsk_node_table_free(©); tsk_safe_free(rows); return ret; } int tsk_node_table_update_row(tsk_node_table_t *self, tsk_id_t index, tsk_flags_t flags, double time, tsk_id_t population, tsk_id_t individual, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_node_t current_row; ret = tsk_node_table_get_row(self, index, ¤t_row); if (ret != 0) { goto out; } if (current_row.metadata_length == metadata_length) { self->flags[index] = flags; self->time[index] = time; self->population[index] = population; self->individual[index] = individual; /* Note: important to use tsk_memmove here as we may be provided pointers * to the column memory as input via get_row */ tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, metadata_length * sizeof(*metadata)); } else { ret = tsk_node_table_update_row_rewrite( self, index, flags, time, population, individual, metadata, metadata_length); if (ret != 0) { goto out; } } out: return ret; } int TSK_WARN_UNUSED tsk_node_table_clear(tsk_node_table_t *self) { return tsk_node_table_truncate(self, 0); } int tsk_node_table_truncate(tsk_node_table_t *self, tsk_size_t num_rows) { int ret = 0; if (num_rows > self->num_rows) { ret = tsk_trace_error(TSK_ERR_BAD_TABLE_POSITION); goto out; } self->num_rows = num_rows; self->metadata_length = 
self->metadata_offset[num_rows]; out: return ret; } int tsk_node_table_extend(tsk_node_table_t *self, const tsk_node_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_node_t node; if (self == other) { ret = tsk_trace_error(TSK_ERR_CANNOT_EXTEND_FROM_SELF); goto out; } /* We know how much to expand the non-ragged columns, so do it ahead of time */ ret = tsk_node_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } for (j = 0; j < num_rows; j++) { ret = tsk_node_table_get_row( other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &node); if (ret != 0) { goto out; } ret_id = tsk_node_table_add_row(self, node.flags, node.time, node.population, node.individual, node.metadata, node.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; out: return ret; } void tsk_node_table_print_state(const tsk_node_table_t *self, FILE *out) { tsk_size_t j, k; fprintf(out, "\n" TABLE_SEP); fprintf(out, "tsk_node_tbl: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, (long long) self->max_rows_increment); fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->metadata_length, (long long) self->max_metadata_length, (long long) self->max_metadata_length_increment); fprintf(out, TABLE_SEP); /* We duplicate the dump_text code here for simplicity because we want to output * the flags column directly. 
*/ write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); fprintf(out, "id\tflags\ttime\tpopulation\tindividual\tmetadata_offset\tmetadata\n"); for (j = 0; j < self->num_rows; j++) { fprintf(out, "%lld\t%lld\t%f\t%lld\t%lld\t%lld\t", (long long) j, (long long) self->flags[j], self->time[j], (long long) self->population[j], (long long) self->individual[j], (long long) self->metadata_offset[j]); for (k = self->metadata_offset[j]; k < self->metadata_offset[j + 1]; k++) { fprintf(out, "%c", self->metadata[k]); } fprintf(out, "\n"); } tsk_bug_assert(self->metadata_offset[0] == 0); tsk_bug_assert(self->metadata_offset[self->num_rows] == self->metadata_length); } int tsk_node_table_set_metadata_schema(tsk_node_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, &self->metadata_schema_length, metadata_schema, metadata_schema_length); } int tsk_node_table_dump_text(const tsk_node_table_t *self, FILE *out) { int ret = TSK_ERR_IO; tsk_size_t j; tsk_size_t metadata_len; int err; err = write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); if (err < 0) { goto out; } err = fprintf(out, "id\tis_sample\ttime\tpopulation\tindividual\tmetadata\n"); if (err < 0) { goto out; } for (j = 0; j < self->num_rows; j++) { metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; err = fprintf(out, "%lld\t%lld\t%f\t%lld\t%lld\t%.*s\n", (long long) j, (long long) (self->flags[j] & TSK_NODE_IS_SAMPLE), self->time[j], (long long) self->population[j], (long long) self->individual[j], (int) metadata_len, self->metadata + self->metadata_offset[j]); if (err < 0) { goto out; } } ret = 0; out: return ret; } bool tsk_node_table_equals( const tsk_node_table_t *self, const tsk_node_table_t *other, tsk_flags_t options) { bool ret = self->num_rows == other->num_rows && tsk_memcmp(self->time, other->time, self->num_rows * sizeof(double)) == 0 && 
tsk_memcmp(self->flags, other->flags, self->num_rows * sizeof(tsk_flags_t)) == 0 && tsk_memcmp( self->population, other->population, self->num_rows * sizeof(tsk_id_t)) == 0 && tsk_memcmp( self->individual, other->individual, self->num_rows * sizeof(tsk_id_t)) == 0; if (!(options & TSK_CMP_IGNORE_METADATA)) { ret = ret && self->metadata_length == other->metadata_length && self->metadata_schema_length == other->metadata_schema_length && tsk_memcmp(self->metadata_offset, other->metadata_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->metadata, other->metadata, self->metadata_length * sizeof(char)) == 0 && tsk_memcmp(self->metadata_schema, other->metadata_schema, self->metadata_schema_length * sizeof(char)) == 0; } return ret; } static inline void tsk_node_table_get_row_unsafe( const tsk_node_table_t *self, tsk_id_t index, tsk_node_t *row) { row->id = (tsk_id_t) index; row->flags = self->flags[index]; row->time = self->time[index]; row->population = self->population[index]; row->individual = self->individual[index]; row->metadata_length = self->metadata_offset[index + 1] - self->metadata_offset[index]; row->metadata = self->metadata + self->metadata_offset[index]; } int tsk_node_table_get_row(const tsk_node_table_t *self, tsk_id_t index, tsk_node_t *row) { int ret = 0; if (index < 0 || index >= (tsk_id_t) self->num_rows) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } tsk_node_table_get_row_unsafe(self, index, row); out: return ret; } int tsk_node_table_keep_rows(tsk_node_table_t *self, const tsk_bool_t *keep, tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) { int ret = 0; tsk_size_t remaining_rows; if (id_map != NULL) { keep_mask_to_id_map(self->num_rows, keep, id_map); } remaining_rows = subset_flags_column(self->flags, self->num_rows, keep); subset_double_column(self->time, self->num_rows, keep); subset_id_column(self->population, self->num_rows, keep); subset_id_column(self->individual, self->num_rows, keep); if 
(self->metadata_length > 0) { self->metadata_length = subset_ragged_char_column( self->metadata, self->metadata_offset, self->num_rows, keep); } self->num_rows = remaining_rows; return ret; } static int tsk_node_table_dump(const tsk_node_table_t *self, kastore_t *store, tsk_flags_t options) { const write_table_col_t cols[] = { { "nodes/time", (void *) self->time, self->num_rows, KAS_FLOAT64 }, { "nodes/flags", (void *) self->flags, self->num_rows, TSK_FLAGS_STORAGE_TYPE }, { "nodes/population", (void *) self->population, self->num_rows, TSK_ID_STORAGE_TYPE }, { "nodes/individual", (void *) self->individual, self->num_rows, TSK_ID_STORAGE_TYPE }, { "nodes/metadata_schema", (void *) self->metadata_schema, self->metadata_schema_length, KAS_UINT8 }, { .name = NULL }, }; const write_table_ragged_col_t ragged_cols[] = { { "nodes/metadata", (void *) self->metadata, self->metadata_length, KAS_UINT8, self->metadata_offset, self->num_rows }, { .name = NULL }, }; return write_table(store, cols, ragged_cols, options); } static int tsk_node_table_load(tsk_node_table_t *self, kastore_t *store) { int ret = 0; char *metadata_schema = NULL; double *time = NULL; tsk_flags_t *flags = NULL; tsk_id_t *population = NULL; tsk_id_t *individual = NULL; char *metadata = NULL; tsk_size_t *metadata_offset = NULL; tsk_size_t num_rows, metadata_length, metadata_schema_length; read_table_col_t cols[] = { { "nodes/time", (void **) &time, KAS_FLOAT64, 0 }, { "nodes/flags", (void **) &flags, TSK_FLAGS_STORAGE_TYPE, 0 }, { "nodes/population", (void **) &population, TSK_ID_STORAGE_TYPE, 0 }, { "nodes/individual", (void **) &individual, TSK_ID_STORAGE_TYPE, 0 }, { .name = NULL }, }; read_table_ragged_col_t ragged_cols[] = { { "nodes/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, &metadata_offset, 0 }, { .name = NULL }, }; read_table_property_t properties[] = { { "nodes/metadata_schema", (void **) &metadata_schema, &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, { .name = NULL }, 
}; ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); if (ret != 0) { goto out; } if (metadata_schema != NULL) { ret = tsk_node_table_set_metadata_schema( self, metadata_schema, metadata_schema_length); if (ret != 0) { goto out; } } ret = tsk_node_table_takeset_columns( self, num_rows, flags, time, population, individual, metadata, metadata_offset); if (ret != 0) { goto out; } flags = NULL; time = NULL; population = NULL; individual = NULL; metadata = NULL; metadata_offset = NULL; out: free_read_table_mem(cols, ragged_cols, properties); return ret; } /************************* * edge table *************************/ static void tsk_edge_table_free_columns(tsk_edge_table_t *self) { tsk_safe_free(self->left); tsk_safe_free(self->right); tsk_safe_free(self->parent); tsk_safe_free(self->child); tsk_safe_free(self->metadata); tsk_safe_free(self->metadata_offset); } int tsk_edge_table_free(tsk_edge_table_t *self) { tsk_edge_table_free_columns(self); tsk_safe_free(self->metadata_schema); return 0; } static int tsk_edge_table_has_metadata(const tsk_edge_table_t *self) { return !(self->options & TSK_TABLE_NO_METADATA); } static int tsk_edge_table_expand_main_columns(tsk_edge_table_t *self, tsk_size_t additional_rows) { int ret = 0; tsk_size_t new_max_rows; ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, additional_rows, &new_max_rows); if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { ret = expand_column((void **) &self->left, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->right, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->parent, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->child, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } if (tsk_edge_table_has_metadata(self)) { ret = expand_column( (void **) 
&self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } } self->max_rows = new_max_rows; } out: return ret; } static int tsk_edge_table_expand_metadata(tsk_edge_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->metadata_length, additional_length, self->max_metadata_length_increment, &self->max_metadata_length, (void **) &self->metadata, sizeof(*self->metadata)); } int tsk_edge_table_set_max_rows_increment( tsk_edge_table_t *self, tsk_size_t max_rows_increment) { self->max_rows_increment = max_rows_increment; return 0; } int tsk_edge_table_set_max_metadata_length_increment( tsk_edge_table_t *self, tsk_size_t max_metadata_length_increment) { self->max_metadata_length_increment = max_metadata_length_increment; return 0; } int tsk_edge_table_init(tsk_edge_table_t *self, tsk_flags_t options) { int ret = 0; tsk_memset(self, 0, sizeof(*self)); self->options = options; /* Allocate space for one row initially, ensuring we always have valid * pointers even if the table is empty */ self->max_rows_increment = 1; self->max_metadata_length_increment = 1; ret = tsk_edge_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } if (tsk_edge_table_has_metadata(self)) { ret = tsk_edge_table_expand_metadata(self, 1); if (ret != 0) { goto out; } self->metadata_offset[0] = 0; } self->max_rows_increment = 0; self->max_metadata_length_increment = 0; tsk_edge_table_set_metadata_schema(self, NULL, 0); out: return ret; } tsk_id_t tsk_edge_table_add_row(tsk_edge_table_t *self, double left, double right, tsk_id_t parent, tsk_id_t child, const char *metadata, tsk_size_t metadata_length) { tsk_id_t ret = 0; if (metadata_length > 0 && !tsk_edge_table_has_metadata(self)) { ret = tsk_trace_error(TSK_ERR_METADATA_DISABLED); goto out; } ret = tsk_edge_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } tsk_bug_assert(self->num_rows < self->max_rows); self->left[self->num_rows] = left; self->right[self->num_rows] = 
right; self->parent[self->num_rows] = parent; self->child[self->num_rows] = child; if (tsk_edge_table_has_metadata(self)) { ret = tsk_edge_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } tsk_bug_assert( self->metadata_length + metadata_length <= self->max_metadata_length); tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length); self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; self->metadata_length += metadata_length; } ret = (tsk_id_t) self->num_rows; self->num_rows++; out: return ret; } static int tsk_edge_table_update_row_rewrite(tsk_edge_table_t *self, tsk_id_t index, double left, double right, tsk_id_t parent, tsk_id_t child, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_id_t j, ret_id; tsk_edge_table_t copy; tsk_size_t num_rows; tsk_id_t *rows = NULL; ret = tsk_edge_table_copy(self, ©, 0); if (ret != 0) { goto out; } rows = tsk_malloc(self->num_rows * sizeof(*rows)); if (rows == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_edge_table_truncate(self, (tsk_size_t) index); tsk_bug_assert(ret == 0); ret_id = tsk_edge_table_add_row( self, left, right, parent, child, metadata, metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_rows = 0; for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { rows[num_rows] = j; num_rows++; } ret = tsk_edge_table_extend(self, ©, num_rows, rows, 0); if (ret != 0) { goto out; } out: tsk_edge_table_free(©); tsk_safe_free(rows); return ret; } int tsk_edge_table_update_row(tsk_edge_table_t *self, tsk_id_t index, double left, double right, tsk_id_t parent, tsk_id_t child, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_edge_t current_row; ret = tsk_edge_table_get_row(self, index, ¤t_row); if (ret != 0) { goto out; } if (current_row.metadata_length == metadata_length) { self->left[index] = left; self->right[index] = right; self->parent[index] = parent; 
self->child[index] = child; if (tsk_edge_table_has_metadata(self)) { /* Note: important to use tsk_memmove here as we may be provided pointers * to the column memory as input via get_row */ tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, metadata_length * sizeof(*metadata)); } } else { ret = tsk_edge_table_update_row_rewrite( self, index, left, right, parent, child, metadata, metadata_length); if (ret != 0) { goto out; } } out: return ret; } int TSK_WARN_UNUSED tsk_edge_table_copy( const tsk_edge_table_t *self, tsk_edge_table_t *dest, tsk_flags_t options) { int ret = 0; char *metadata = NULL; tsk_size_t *metadata_offset = NULL; if (!(options & TSK_NO_INIT)) { ret = tsk_edge_table_init(dest, options); if (ret != 0) { goto out; } } /* We can't use TSK_TABLE_NO_METADATA in dest if metadata_length is non-zero. * This also captures the case where TSK_TABLE_NO_METADATA is set on this table. */ if (self->metadata_length > 0 && !tsk_edge_table_has_metadata(dest)) { ret = tsk_trace_error(TSK_ERR_METADATA_DISABLED); goto out; } if (tsk_edge_table_has_metadata(dest)) { metadata = self->metadata; metadata_offset = self->metadata_offset; } ret = tsk_edge_table_set_columns(dest, self->num_rows, self->left, self->right, self->parent, self->child, metadata, metadata_offset); if (ret != 0) { goto out; } ret = tsk_edge_table_set_metadata_schema( dest, self->metadata_schema, self->metadata_schema_length); out: return ret; } int tsk_edge_table_set_columns(tsk_edge_table_t *self, tsk_size_t num_rows, const double *left, const double *right, const tsk_id_t *parent, const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset) { int ret = 0; ret = tsk_edge_table_clear(self); if (ret != 0) { goto out; } ret = tsk_edge_table_append_columns( self, num_rows, left, right, parent, child, metadata, metadata_offset); out: return ret; } int TSK_WARN_UNUSED tsk_edge_table_takeset_columns(tsk_edge_table_t *self, tsk_size_t num_rows, double *left, double 
*right, tsk_id_t *parent, tsk_id_t *child, char *metadata, tsk_size_t *metadata_offset) { int ret = 0; /* We need to check all the inputs before we start freeing or taking memory */ if (left == NULL || right == NULL || parent == NULL || child == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if (metadata != NULL && !tsk_edge_table_has_metadata(self)) { ret = tsk_trace_error(TSK_ERR_METADATA_DISABLED); goto out; } ret = check_ragged_column(num_rows, metadata, metadata_offset); if (ret != 0) { goto out; } tsk_edge_table_free_columns(self); self->num_rows = num_rows; self->max_rows = num_rows; self->left = left; self->right = right; self->parent = parent; self->child = child; ret = takeset_ragged_column(num_rows, metadata, metadata_offset, (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); if (ret != 0) { goto out; } out: return ret; } int tsk_edge_table_append_columns(tsk_edge_table_t *self, tsk_size_t num_rows, const double *left, const double *right, const tsk_id_t *parent, const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset) { int ret; tsk_size_t j, metadata_length; if (left == NULL || right == NULL || parent == NULL || child == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if ((metadata == NULL) != (metadata_offset == NULL)) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if (metadata != NULL && !tsk_edge_table_has_metadata(self)) { ret = tsk_trace_error(TSK_ERR_METADATA_DISABLED); goto out; } ret = tsk_edge_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } tsk_memcpy(self->left + self->num_rows, left, num_rows * sizeof(double)); tsk_memcpy(self->right + self->num_rows, right, num_rows * sizeof(double)); tsk_memcpy(self->parent + self->num_rows, parent, num_rows * sizeof(tsk_id_t)); tsk_memcpy(self->child + self->num_rows, child, num_rows * sizeof(tsk_id_t)); if (tsk_edge_table_has_metadata(self)) { if (metadata == NULL) { for (j = 0; j 
< num_rows; j++) { self->metadata_offset[self->num_rows + j + 1] = self->metadata_length; } } else { ret = check_offsets(num_rows, metadata_offset, 0, false); if (ret != 0) { goto out; } for (j = 0; j < num_rows; j++) { self->metadata_offset[self->num_rows + j] = (tsk_size_t) self->metadata_length + metadata_offset[j]; } metadata_length = metadata_offset[num_rows]; ret = tsk_edge_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } tsk_memcpy(self->metadata + self->metadata_length, metadata, metadata_length * sizeof(char)); self->metadata_length += metadata_length; } self->num_rows += num_rows; self->metadata_offset[self->num_rows] = self->metadata_length; } else { self->num_rows += num_rows; } out: return ret; } int tsk_edge_table_clear(tsk_edge_table_t *self) { return tsk_edge_table_truncate(self, 0); } int tsk_edge_table_truncate(tsk_edge_table_t *self, tsk_size_t num_rows) { int ret = 0; if (num_rows > self->num_rows) { ret = tsk_trace_error(TSK_ERR_BAD_TABLE_POSITION); goto out; } self->num_rows = num_rows; if (tsk_edge_table_has_metadata(self)) { self->metadata_length = self->metadata_offset[num_rows]; } out: return ret; } int tsk_edge_table_extend(tsk_edge_table_t *self, const tsk_edge_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_edge_t edge; if (self == other) { ret = tsk_trace_error(TSK_ERR_CANNOT_EXTEND_FROM_SELF); goto out; } /* We know how much to expand the non-ragged columns, so do it ahead of time */ ret = tsk_edge_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } for (j = 0; j < num_rows; j++) { ret = tsk_edge_table_get_row( other, row_indexes == NULL ? 
(tsk_id_t) j : row_indexes[j], &edge); if (ret != 0) { goto out; } ret_id = tsk_edge_table_add_row(self, edge.left, edge.right, edge.parent, edge.child, edge.metadata, edge.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; out: return ret; } static inline void tsk_edge_table_get_row_unsafe( const tsk_edge_table_t *self, tsk_id_t index, tsk_edge_t *row) { row->id = (tsk_id_t) index; row->left = self->left[index]; row->right = self->right[index]; row->parent = self->parent[index]; row->child = self->child[index]; if (tsk_edge_table_has_metadata(self)) { row->metadata_length = self->metadata_offset[index + 1] - self->metadata_offset[index]; row->metadata = self->metadata + self->metadata_offset[index]; } else { row->metadata_length = 0; row->metadata = NULL; } } int tsk_edge_table_get_row(const tsk_edge_table_t *self, tsk_id_t index, tsk_edge_t *row) { int ret = 0; if (index < 0 || index >= (tsk_id_t) self->num_rows) { ret = tsk_trace_error(TSK_ERR_EDGE_OUT_OF_BOUNDS); goto out; } tsk_edge_table_get_row_unsafe(self, index, row); out: return ret; } void tsk_edge_table_print_state(const tsk_edge_table_t *self, FILE *out) { int ret; fprintf(out, "\n" TABLE_SEP); fprintf(out, "edge_table: %p:\n", (const void *) self); fprintf(out, "options = 0x%X\n", self->options); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, (long long) self->max_rows_increment); fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->metadata_length, (long long) self->max_metadata_length, (long long) self->max_metadata_length_increment); fprintf(out, TABLE_SEP); ret = tsk_edge_table_dump_text(self, out); tsk_bug_assert(ret == 0); } int tsk_edge_table_set_metadata_schema(tsk_edge_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, &self->metadata_schema_length, metadata_schema, 
metadata_schema_length); } int tsk_edge_table_dump_text(const tsk_edge_table_t *self, FILE *out) { tsk_id_t j; int ret = TSK_ERR_IO; tsk_edge_t row; int err; err = write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); if (err < 0) { goto out; } err = fprintf(out, "id\tleft\tright\tparent\tchild\tmetadata\n"); if (err < 0) { goto out; } for (j = 0; j < (tsk_id_t) self->num_rows; j++) { tsk_edge_table_get_row_unsafe(self, j, &row); err = fprintf(out, "%lld\t%.3f\t%.3f\t%lld\t%lld\t%.*s\n", (long long) j, row.left, row.right, (long long) row.parent, (long long) row.child, (int) row.metadata_length, row.metadata); if (err < 0) { goto out; } } ret = 0; out: return ret; } bool tsk_edge_table_equals( const tsk_edge_table_t *self, const tsk_edge_table_t *other, tsk_flags_t options) { bool metadata_equal; bool ret = self->num_rows == other->num_rows && tsk_memcmp(self->left, other->left, self->num_rows * sizeof(double)) == 0 && tsk_memcmp(self->right, other->right, self->num_rows * sizeof(double)) == 0 && tsk_memcmp(self->parent, other->parent, self->num_rows * sizeof(tsk_id_t)) == 0 && tsk_memcmp(self->child, other->child, self->num_rows * sizeof(tsk_id_t)) == 0; if (!(options & TSK_CMP_IGNORE_METADATA)) { ret = ret && self->metadata_schema_length == other->metadata_schema_length && tsk_memcmp(self->metadata_schema, other->metadata_schema, self->metadata_schema_length * sizeof(char)) == 0; metadata_equal = false; if (self->metadata_length == other->metadata_length) { if (tsk_edge_table_has_metadata(self) && tsk_edge_table_has_metadata(other)) { metadata_equal = tsk_memcmp(self->metadata_offset, other->metadata_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->metadata, other->metadata, self->metadata_length * sizeof(char)) == 0; } else { /* The only way that the metadata lengths can be equal (which * we've already tested) and either one or the other of the tables * hasn't got metadata is if they are both zero 
length. */ tsk_bug_assert(self->metadata_length == 0); metadata_equal = true; } } ret = ret && metadata_equal; } return ret; } int tsk_edge_table_keep_rows(tsk_edge_table_t *self, const tsk_bool_t *keep, tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) { int ret = 0; tsk_size_t remaining_rows; if (id_map != NULL) { keep_mask_to_id_map(self->num_rows, keep, id_map); } remaining_rows = subset_double_column(self->left, self->num_rows, keep); subset_double_column(self->right, self->num_rows, keep); subset_id_column(self->parent, self->num_rows, keep); subset_id_column(self->child, self->num_rows, keep); if (self->metadata_length > 0) { tsk_bug_assert(!(self->options & TSK_TABLE_NO_METADATA)); self->metadata_length = subset_ragged_char_column( self->metadata, self->metadata_offset, self->num_rows, keep); } self->num_rows = remaining_rows; return ret; } static int tsk_edge_table_dump(const tsk_edge_table_t *self, kastore_t *store, tsk_flags_t options) { int ret = 0; const write_table_col_t write_cols[] = { { "edges/left", (void *) self->left, self->num_rows, KAS_FLOAT64 }, { "edges/right", (void *) self->right, self->num_rows, KAS_FLOAT64 }, { "edges/parent", (void *) self->parent, self->num_rows, TSK_ID_STORAGE_TYPE }, { "edges/child", (void *) self->child, self->num_rows, TSK_ID_STORAGE_TYPE }, { "edges/metadata_schema", (void *) self->metadata_schema, self->metadata_schema_length, KAS_UINT8 }, { .name = NULL }, }; const write_table_ragged_col_t ragged_cols[] = { { "edges/metadata", (void *) self->metadata, self->metadata_length, KAS_UINT8, self->metadata_offset, self->num_rows }, { .name = NULL }, }; /* TODO when the general code has been updated to only write out the * column when the lenght of ragged columns is > 0 we can get rid of * this special case here and use write_table. 
*/ ret = write_table_cols(store, write_cols, options); if (ret != 0) { goto out; } if (tsk_edge_table_has_metadata(self)) { ret = write_table_ragged_cols(store, ragged_cols, options); if (ret != 0) { goto out; } } out: return ret; } static int tsk_edge_table_load(tsk_edge_table_t *self, kastore_t *store) { int ret = 0; char *metadata_schema = NULL; double *left = NULL; double *right = NULL; tsk_id_t *parent = NULL; tsk_id_t *child = NULL; char *metadata = NULL; tsk_size_t *metadata_offset = NULL; tsk_size_t num_rows, metadata_length, metadata_schema_length; read_table_col_t cols[] = { { "edges/left", (void **) &left, KAS_FLOAT64, 0 }, { "edges/right", (void **) &right, KAS_FLOAT64, 0 }, { "edges/parent", (void **) &parent, TSK_ID_STORAGE_TYPE, 0 }, { "edges/child", (void **) &child, TSK_ID_STORAGE_TYPE, 0 }, { .name = NULL }, }; read_table_ragged_col_t ragged_cols[] = { { "edges/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, &metadata_offset, TSK_COL_OPTIONAL }, { .name = NULL }, }; read_table_property_t properties[] = { { "edges/metadata_schema", (void **) &metadata_schema, &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, { .name = NULL }, }; ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); if (ret != 0) { goto out; } if (metadata_schema != NULL) { ret = tsk_edge_table_set_metadata_schema( self, metadata_schema, metadata_schema_length); if (ret != 0) { goto out; } } ret = tsk_edge_table_takeset_columns( self, num_rows, left, right, parent, child, metadata, metadata_offset); if (ret != 0) { goto out; } left = NULL; right = NULL; parent = NULL; child = NULL; metadata = NULL; metadata_offset = NULL; out: free_read_table_mem(cols, ragged_cols, properties); return ret; } int tsk_edge_table_squash(tsk_edge_table_t *self) { int k; int ret = 0; tsk_edge_t *edges = NULL; tsk_size_t num_output_edges; if (self->metadata_length > 0) { ret = tsk_trace_error(TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA); goto out; } edges = 
tsk_malloc(self->num_rows * sizeof(tsk_edge_t)); if (edges == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (k = 0; k < (int) self->num_rows; k++) { edges[k].left = self->left[k]; edges[k].right = self->right[k]; edges[k].parent = self->parent[k]; edges[k].child = self->child[k]; edges[k].metadata_length = 0; } ret = tsk_squash_edges(edges, self->num_rows, &num_output_edges); if (ret != 0) { goto out; } tsk_edge_table_clear(self); tsk_bug_assert(num_output_edges <= self->max_rows); self->num_rows = num_output_edges; for (k = 0; k < (int) num_output_edges; k++) { self->left[k] = edges[k].left; self->right[k] = edges[k].right; self->parent[k] = edges[k].parent; self->child[k] = edges[k].child; } out: tsk_safe_free(edges); return ret; } /************************* * site table *************************/ static void tsk_site_table_free_columns(tsk_site_table_t *self) { tsk_safe_free(self->position); tsk_safe_free(self->ancestral_state); tsk_safe_free(self->ancestral_state_offset); tsk_safe_free(self->metadata); tsk_safe_free(self->metadata_offset); } int tsk_site_table_free(tsk_site_table_t *self) { tsk_site_table_free_columns(self); tsk_safe_free(self->metadata_schema); return 0; } static int tsk_site_table_expand_main_columns(tsk_site_table_t *self, tsk_size_t additional_rows) { int ret = 0; tsk_size_t new_max_rows; ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, additional_rows, &new_max_rows); if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { ret = expand_column((void **) &self->position, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->ancestral_state_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } self->max_rows = new_max_rows; } out: return ret; } static int 
tsk_site_table_expand_ancestral_state( tsk_site_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->ancestral_state_length, additional_length, self->max_ancestral_state_length_increment, &self->max_ancestral_state_length, (void **) &self->ancestral_state, sizeof(*self->ancestral_state)); } static int tsk_site_table_expand_metadata(tsk_site_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->metadata_length, additional_length, self->max_metadata_length_increment, &self->max_metadata_length, (void **) &self->metadata, sizeof(*self->metadata)); } int tsk_site_table_set_max_rows_increment( tsk_site_table_t *self, tsk_size_t max_rows_increment) { self->max_rows_increment = max_rows_increment; return 0; } int tsk_site_table_set_max_metadata_length_increment( tsk_site_table_t *self, tsk_size_t max_metadata_length_increment) { self->max_metadata_length_increment = max_metadata_length_increment; return 0; } int tsk_site_table_set_max_ancestral_state_length_increment( tsk_site_table_t *self, tsk_size_t max_ancestral_state_length_increment) { self->max_ancestral_state_length_increment = max_ancestral_state_length_increment; return 0; } int tsk_site_table_init(tsk_site_table_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_memset(self, 0, sizeof(tsk_site_table_t)); /* Allocate space for one row initially, ensuring we always have valid pointers * even if the table is empty */ self->max_rows_increment = 1; self->max_ancestral_state_length_increment = 1; self->max_metadata_length_increment = 1; ret = tsk_site_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_site_table_expand_ancestral_state(self, 1); if (ret != 0) { goto out; } ret = tsk_site_table_expand_metadata(self, 1); if (ret != 0) { goto out; } self->ancestral_state_offset[0] = 0; self->metadata_offset[0] = 0; self->max_rows_increment = 0; self->max_ancestral_state_length_increment = 0; self->max_metadata_length_increment = 0; 
tsk_site_table_set_metadata_schema(self, NULL, 0); out: return ret; } tsk_id_t tsk_site_table_add_row(tsk_site_table_t *self, double position, const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, tsk_size_t metadata_length) { tsk_id_t ret = 0; tsk_size_t ancestral_state_offset, metadata_offset; ret = tsk_site_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } self->position[self->num_rows] = position; ancestral_state_offset = (tsk_size_t) self->ancestral_state_length; tsk_bug_assert( self->ancestral_state_offset[self->num_rows] == ancestral_state_offset); ret = tsk_site_table_expand_ancestral_state(self, ancestral_state_length); if (ret != 0) { goto out; } self->ancestral_state_length += ancestral_state_length; tsk_memmove(self->ancestral_state + ancestral_state_offset, ancestral_state, ancestral_state_length); self->ancestral_state_offset[self->num_rows + 1] = self->ancestral_state_length; metadata_offset = (tsk_size_t) self->metadata_length; tsk_bug_assert(self->metadata_offset[self->num_rows] == metadata_offset); ret = tsk_site_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } self->metadata_length += metadata_length; tsk_memmove(self->metadata + metadata_offset, metadata, metadata_length); self->metadata_offset[self->num_rows + 1] = self->metadata_length; ret = (tsk_id_t) self->num_rows; self->num_rows++; out: return ret; } static int tsk_site_table_update_row_rewrite(tsk_site_table_t *self, tsk_id_t index, double position, const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_id_t j, ret_id; tsk_site_table_t copy; tsk_size_t num_rows; tsk_id_t *rows = NULL; ret = tsk_site_table_copy(self, ©, 0); if (ret != 0) { goto out; } rows = tsk_malloc(self->num_rows * sizeof(*rows)); if (rows == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_site_table_truncate(self, (tsk_size_t) index); 
tsk_bug_assert(ret == 0); ret_id = tsk_site_table_add_row(self, position, ancestral_state, ancestral_state_length, metadata, metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_rows = 0; for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { rows[num_rows] = j; num_rows++; } ret = tsk_site_table_extend(self, ©, num_rows, rows, 0); if (ret != 0) { goto out; } out: tsk_site_table_free(©); tsk_safe_free(rows); return ret; } int tsk_site_table_update_row(tsk_site_table_t *self, tsk_id_t index, double position, const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_site_t current_row; ret = tsk_site_table_get_row(self, index, ¤t_row); if (ret != 0) { goto out; } if (current_row.metadata_length == metadata_length && current_row.ancestral_state_length == ancestral_state_length) { self->position[index] = position; /* Note: important to use tsk_memmove here as we may be provided pointers * to the column memory as input via get_row */ tsk_memmove(&self->ancestral_state[self->ancestral_state_offset[index]], ancestral_state, ancestral_state_length * sizeof(*ancestral_state)); tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, metadata_length * sizeof(*metadata)); } else { ret = tsk_site_table_update_row_rewrite(self, index, position, ancestral_state, ancestral_state_length, metadata, metadata_length); if (ret != 0) { goto out; } } out: return ret; } int tsk_site_table_append_columns(tsk_site_table_t *self, tsk_size_t num_rows, const double *position, const char *ancestral_state, const tsk_size_t *ancestral_state_offset, const char *metadata, const tsk_size_t *metadata_offset) { int ret = 0; tsk_size_t j, ancestral_state_length, metadata_length; if (position == NULL || ancestral_state == NULL || ancestral_state_offset == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if ((metadata == NULL) != (metadata_offset == NULL)) { ret = 
tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = tsk_site_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } tsk_memcpy(self->position + self->num_rows, position, num_rows * sizeof(double)); /* Metadata column */ if (metadata == NULL) { for (j = 0; j < num_rows; j++) { self->metadata_offset[self->num_rows + j + 1] = self->metadata_length; } } else { ret = check_offsets(num_rows, metadata_offset, 0, false); if (ret != 0) { goto out; } metadata_length = metadata_offset[num_rows]; ret = tsk_site_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } tsk_memcpy(self->metadata + self->metadata_length, metadata, metadata_length * sizeof(char)); for (j = 0; j < num_rows; j++) { self->metadata_offset[self->num_rows + j] = self->metadata_length + metadata_offset[j]; } self->metadata_length += metadata_length; } self->metadata_offset[self->num_rows + num_rows] = self->metadata_length; /* Ancestral state column */ ret = check_offsets(num_rows, ancestral_state_offset, 0, false); if (ret != 0) { goto out; } ancestral_state_length = ancestral_state_offset[num_rows]; ret = tsk_site_table_expand_ancestral_state(self, ancestral_state_length); if (ret != 0) { goto out; } tsk_memcpy(self->ancestral_state + self->ancestral_state_length, ancestral_state, ancestral_state_length * sizeof(char)); for (j = 0; j < num_rows; j++) { self->ancestral_state_offset[self->num_rows + j] = self->ancestral_state_length + ancestral_state_offset[j]; } self->ancestral_state_length += ancestral_state_length; self->ancestral_state_offset[self->num_rows + num_rows] = self->ancestral_state_length; self->num_rows += num_rows; out: return ret; } int TSK_WARN_UNUSED tsk_site_table_copy( const tsk_site_table_t *self, tsk_site_table_t *dest, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_site_table_init(dest, 0); if (ret != 0) { goto out; } } ret = tsk_site_table_set_columns(dest, self->num_rows, self->position, 
self->ancestral_state, self->ancestral_state_offset, self->metadata, self->metadata_offset); if (ret != 0) { goto out; } ret = tsk_site_table_set_metadata_schema( dest, self->metadata_schema, self->metadata_schema_length); out: return ret; } int tsk_site_table_set_columns(tsk_site_table_t *self, tsk_size_t num_rows, const double *position, const char *ancestral_state, const tsk_size_t *ancestral_state_offset, const char *metadata, const tsk_size_t *metadata_offset) { int ret = 0; ret = tsk_site_table_clear(self); if (ret != 0) { goto out; } ret = tsk_site_table_append_columns(self, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); out: return ret; } int tsk_site_table_takeset_columns(tsk_site_table_t *self, tsk_size_t num_rows, double *position, char *ancestral_state, tsk_size_t *ancestral_state_offset, char *metadata, tsk_size_t *metadata_offset) { int ret = 0; /* We need to check all the inputs before we start freeing or taking memory */ if (position == NULL || ancestral_state == NULL || ancestral_state_offset == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = check_ragged_column(num_rows, ancestral_state, ancestral_state_offset); if (ret != 0) { goto out; } ret = check_ragged_column(num_rows, metadata, metadata_offset); if (ret != 0) { goto out; } tsk_site_table_free_columns(self); self->num_rows = num_rows; self->max_rows = num_rows; self->position = position; ret = takeset_ragged_column(num_rows, ancestral_state, ancestral_state_offset, (void *) &self->ancestral_state, &self->ancestral_state_offset, &self->ancestral_state_length); if (ret != 0) { goto out; } ret = takeset_ragged_column(num_rows, metadata, metadata_offset, (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); if (ret != 0) { goto out; } out: return ret; } bool tsk_site_table_equals( const tsk_site_table_t *self, const tsk_site_table_t *other, tsk_flags_t options) { bool ret = self->num_rows == other->num_rows 
&& self->ancestral_state_length == other->ancestral_state_length && tsk_memcmp(self->position, other->position, self->num_rows * sizeof(double)) == 0 && tsk_memcmp(self->ancestral_state_offset, other->ancestral_state_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->ancestral_state, other->ancestral_state, self->ancestral_state_length * sizeof(char)) == 0; if (!(options & TSK_CMP_IGNORE_METADATA)) { ret = ret && self->metadata_length == other->metadata_length && self->metadata_schema_length == other->metadata_schema_length && tsk_memcmp(self->metadata_offset, other->metadata_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->metadata, other->metadata, self->metadata_length * sizeof(char)) == 0 && tsk_memcmp(self->metadata_schema, other->metadata_schema, self->metadata_schema_length * sizeof(char)) == 0; } return ret; } int tsk_site_table_clear(tsk_site_table_t *self) { return tsk_site_table_truncate(self, 0); } int tsk_site_table_truncate(tsk_site_table_t *self, tsk_size_t num_rows) { int ret = 0; if (num_rows > self->num_rows) { ret = tsk_trace_error(TSK_ERR_BAD_TABLE_POSITION); goto out; } self->num_rows = num_rows; self->ancestral_state_length = self->ancestral_state_offset[num_rows]; self->metadata_length = self->metadata_offset[num_rows]; out: return ret; } int tsk_site_table_extend(tsk_site_table_t *self, const tsk_site_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_site_t site; if (self == other) { ret = tsk_trace_error(TSK_ERR_CANNOT_EXTEND_FROM_SELF); goto out; } /* We know how much to expand the non-ragged columns, so do it ahead of time */ ret = tsk_site_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } for (j = 0; j < num_rows; j++) { ret = tsk_site_table_get_row( other, row_indexes == NULL ? 
(tsk_id_t) j : row_indexes[j], &site); if (ret != 0) { goto out; } ret_id = tsk_site_table_add_row(self, site.position, site.ancestral_state, site.ancestral_state_length, site.metadata, site.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; out: return ret; } void tsk_site_table_print_state(const tsk_site_table_t *self, FILE *out) { int ret; fprintf(out, "\n" TABLE_SEP); fprintf(out, "site_table: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\t(max= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, (long long) self->max_rows_increment); fprintf(out, "ancestral_state_length = %lld\t(max= %lld\tincrement = %lld)\n", (long long) self->ancestral_state_length, (long long) self->max_ancestral_state_length, (long long) self->max_ancestral_state_length_increment); fprintf(out, "metadata_length = %lld(\tmax= %lld\tincrement = %lld)\n", (long long) self->metadata_length, (long long) self->max_metadata_length, (long long) self->max_metadata_length_increment); fprintf(out, TABLE_SEP); ret = tsk_site_table_dump_text(self, out); tsk_bug_assert(ret == 0); tsk_bug_assert(self->ancestral_state_offset[0] == 0); tsk_bug_assert( self->ancestral_state_length == self->ancestral_state_offset[self->num_rows]); tsk_bug_assert(self->metadata_offset[0] == 0); tsk_bug_assert(self->metadata_length == self->metadata_offset[self->num_rows]); } static inline void tsk_site_table_get_row_unsafe( const tsk_site_table_t *self, tsk_id_t index, tsk_site_t *row) { row->id = (tsk_id_t) index; row->position = self->position[index]; row->ancestral_state_length = self->ancestral_state_offset[index + 1] - self->ancestral_state_offset[index]; row->ancestral_state = self->ancestral_state + self->ancestral_state_offset[index]; row->metadata_length = self->metadata_offset[index + 1] - self->metadata_offset[index]; row->metadata = self->metadata + self->metadata_offset[index]; /* This struct has a placeholder for mutations. 
Probably should be separate * structs for this (tsk_site_table_row_t?) */ row->mutations_length = 0; row->mutations = NULL; } int tsk_site_table_get_row(const tsk_site_table_t *self, tsk_id_t index, tsk_site_t *row) { int ret = 0; if (index < 0 || index >= (tsk_id_t) self->num_rows) { ret = tsk_trace_error(TSK_ERR_SITE_OUT_OF_BOUNDS); goto out; } tsk_site_table_get_row_unsafe(self, index, row); out: return ret; } int tsk_site_table_set_metadata_schema(tsk_site_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, &self->metadata_schema_length, metadata_schema, metadata_schema_length); } int tsk_site_table_dump_text(const tsk_site_table_t *self, FILE *out) { tsk_size_t j; int ret = TSK_ERR_IO; int err; tsk_size_t ancestral_state_len, metadata_len; err = write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); if (err < 0) { goto out; } err = fprintf(out, "id\tposition\tancestral_state\tmetadata\n"); if (err < 0) { goto out; } for (j = 0; j < self->num_rows; j++) { ancestral_state_len = self->ancestral_state_offset[j + 1] - self->ancestral_state_offset[j]; metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; err = fprintf(out, "%lld\t%f\t%.*s\t%.*s\n", (long long) j, self->position[j], (int) ancestral_state_len, self->ancestral_state + self->ancestral_state_offset[j], (int) metadata_len, self->metadata + self->metadata_offset[j]); if (err < 0) { goto out; } } ret = 0; out: return ret; } int tsk_site_table_keep_rows(tsk_site_table_t *self, const tsk_bool_t *keep, tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) { int ret = 0; tsk_size_t remaining_rows; if (id_map != NULL) { keep_mask_to_id_map(self->num_rows, keep, id_map); } remaining_rows = subset_double_column(self->position, self->num_rows, keep); self->ancestral_state_length = subset_ragged_char_column( self->ancestral_state, self->ancestral_state_offset, self->num_rows, keep); if 
(self->metadata_length > 0) { self->metadata_length = subset_ragged_char_column( self->metadata, self->metadata_offset, self->num_rows, keep); } self->num_rows = remaining_rows; return ret; } static int tsk_site_table_dump(const tsk_site_table_t *self, kastore_t *store, tsk_flags_t options) { const write_table_col_t cols[] = { { "sites/position", (void *) self->position, self->num_rows, KAS_FLOAT64 }, { "sites/metadata_schema", (void *) self->metadata_schema, self->metadata_schema_length, KAS_UINT8 }, { .name = NULL }, }; const write_table_ragged_col_t ragged_cols[] = { { "sites/ancestral_state", (void *) self->ancestral_state, self->ancestral_state_length, KAS_UINT8, self->ancestral_state_offset, self->num_rows }, { "sites/metadata", (void *) self->metadata, self->metadata_length, KAS_UINT8, self->metadata_offset, self->num_rows }, { .name = NULL }, }; return write_table(store, cols, ragged_cols, options); } static int tsk_site_table_load(tsk_site_table_t *self, kastore_t *store) { int ret = 0; char *metadata_schema = NULL; double *position = NULL; char *ancestral_state = NULL; tsk_size_t *ancestral_state_offset = NULL; char *metadata = NULL; tsk_size_t *metadata_offset = NULL; tsk_size_t num_rows, ancestral_state_length, metadata_length, metadata_schema_length; read_table_col_t cols[] = { { "sites/position", (void **) &position, KAS_FLOAT64, 0 }, { .name = NULL }, }; read_table_ragged_col_t ragged_cols[] = { { "sites/ancestral_state", (void **) &ancestral_state, &ancestral_state_length, KAS_UINT8, &ancestral_state_offset, 0 }, { "sites/metadata", (void **) &metadata, &metadata_length, KAS_UINT8, &metadata_offset, 0 }, { .name = NULL }, }; read_table_property_t properties[] = { { "sites/metadata_schema", (void **) &metadata_schema, &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, { .name = NULL }, }; ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0); if (ret != 0) { goto out; } if (metadata_schema != NULL) { ret = 
tsk_site_table_set_metadata_schema( self, metadata_schema, metadata_schema_length); if (ret != 0) { goto out; } } ret = tsk_site_table_takeset_columns(self, num_rows, position, ancestral_state, ancestral_state_offset, metadata, metadata_offset); if (ret != 0) { goto out; } position = NULL; ancestral_state = NULL; ancestral_state_offset = NULL; metadata = NULL; metadata_offset = NULL; out: free_read_table_mem(cols, ragged_cols, properties); return ret; } /************************* * mutation table *************************/ static void tsk_mutation_table_free_columns(tsk_mutation_table_t *self) { tsk_safe_free(self->node); tsk_safe_free(self->site); tsk_safe_free(self->parent); tsk_safe_free(self->time); tsk_safe_free(self->derived_state); tsk_safe_free(self->derived_state_offset); tsk_safe_free(self->metadata); tsk_safe_free(self->metadata_offset); } int tsk_mutation_table_free(tsk_mutation_table_t *self) { tsk_mutation_table_free_columns(self); tsk_safe_free(self->metadata_schema); return 0; } static int tsk_mutation_table_expand_main_columns( tsk_mutation_table_t *self, tsk_size_t additional_rows) { int ret = 0; tsk_size_t new_max_rows; ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, additional_rows, &new_max_rows); if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { ret = expand_column((void **) &self->site, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->node, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->parent, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } ret = expand_column((void **) &self->time, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } ret = expand_column( (void **) &self->derived_state_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if 
(ret != 0) { goto out; } self->max_rows = new_max_rows; } out: return ret; } static int tsk_mutation_table_expand_derived_state( tsk_mutation_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->derived_state_length, additional_length, self->max_derived_state_length_increment, &self->max_derived_state_length, (void **) &self->derived_state, sizeof(*self->derived_state)); } static int tsk_mutation_table_expand_metadata( tsk_mutation_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->metadata_length, additional_length, self->max_metadata_length_increment, &self->max_metadata_length, (void **) &self->metadata, sizeof(*self->metadata)); } int tsk_mutation_table_set_max_rows_increment( tsk_mutation_table_t *self, tsk_size_t max_rows_increment) { self->max_rows_increment = max_rows_increment; return 0; } int tsk_mutation_table_set_max_metadata_length_increment( tsk_mutation_table_t *self, tsk_size_t max_metadata_length_increment) { self->max_metadata_length_increment = max_metadata_length_increment; return 0; } int tsk_mutation_table_set_max_derived_state_length_increment( tsk_mutation_table_t *self, tsk_size_t max_derived_state_length_increment) { self->max_derived_state_length_increment = max_derived_state_length_increment; return 0; } int tsk_mutation_table_init(tsk_mutation_table_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_memset(self, 0, sizeof(tsk_mutation_table_t)); /* Allocate space for one row initially, ensuring we always have valid pointers * even if the table is empty */ self->max_rows_increment = 1; self->max_derived_state_length_increment = 1; self->max_metadata_length_increment = 1; ret = tsk_mutation_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_mutation_table_expand_derived_state(self, 1); if (ret != 0) { goto out; } ret = tsk_mutation_table_expand_metadata(self, 1); if (ret != 0) { goto out; } self->derived_state_offset[0] = 0; self->metadata_offset[0] 
= 0; self->max_rows_increment = 0; self->max_derived_state_length_increment = 0; self->max_metadata_length_increment = 0; tsk_mutation_table_set_metadata_schema(self, NULL, 0); out: return ret; } tsk_id_t tsk_mutation_table_add_row(tsk_mutation_table_t *self, tsk_id_t site, tsk_id_t node, tsk_id_t parent, double time, const char *derived_state, tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length) { tsk_id_t ret; tsk_size_t derived_state_offset, metadata_offset; ret = tsk_mutation_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } self->site[self->num_rows] = site; self->node[self->num_rows] = node; self->parent[self->num_rows] = parent; self->time[self->num_rows] = time; derived_state_offset = self->derived_state_length; tsk_bug_assert(self->derived_state_offset[self->num_rows] == derived_state_offset); ret = tsk_mutation_table_expand_derived_state(self, derived_state_length); if (ret != 0) { goto out; } self->derived_state_length += derived_state_length; tsk_memmove( self->derived_state + derived_state_offset, derived_state, derived_state_length); self->derived_state_offset[self->num_rows + 1] = self->derived_state_length; metadata_offset = self->metadata_length; tsk_bug_assert(self->metadata_offset[self->num_rows] == metadata_offset); ret = tsk_mutation_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } self->metadata_length += metadata_length; tsk_memmove(self->metadata + metadata_offset, metadata, metadata_length); self->metadata_offset[self->num_rows + 1] = self->metadata_length; ret = (tsk_id_t) self->num_rows; self->num_rows++; out: return ret; } static int tsk_mutation_table_update_row_rewrite(tsk_mutation_table_t *self, tsk_id_t index, tsk_id_t site, tsk_id_t node, tsk_id_t parent, double time, const char *derived_state, tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_id_t j, ret_id; tsk_mutation_table_t copy; tsk_size_t num_rows; 
tsk_id_t *rows = NULL; ret = tsk_mutation_table_copy(self, ©, 0); if (ret != 0) { goto out; } rows = tsk_malloc(self->num_rows * sizeof(*rows)); if (rows == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_mutation_table_truncate(self, (tsk_size_t) index); tsk_bug_assert(ret == 0); ret_id = tsk_mutation_table_add_row(self, site, node, parent, time, derived_state, derived_state_length, metadata, metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_rows = 0; for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { rows[num_rows] = j; num_rows++; } ret = tsk_mutation_table_extend(self, ©, num_rows, rows, 0); if (ret != 0) { goto out; } out: tsk_mutation_table_free(©); tsk_safe_free(rows); return ret; } int tsk_mutation_table_update_row(tsk_mutation_table_t *self, tsk_id_t index, tsk_id_t site, tsk_id_t node, tsk_id_t parent, double time, const char *derived_state, tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_mutation_t current_row; ret = tsk_mutation_table_get_row(self, index, ¤t_row); if (ret != 0) { goto out; } if (current_row.metadata_length == metadata_length && current_row.derived_state_length == derived_state_length) { self->site[index] = site; self->node[index] = node; self->parent[index] = parent; self->time[index] = time; /* Note: important to use tsk_memmove here as we may be provided pointers * to the column memory as input via get_row */ tsk_memmove(&self->derived_state[self->derived_state_offset[index]], derived_state, derived_state_length * sizeof(*derived_state)); tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, metadata_length * sizeof(*metadata)); } else { ret = tsk_mutation_table_update_row_rewrite(self, index, site, node, parent, time, derived_state, derived_state_length, metadata, metadata_length); if (ret != 0) { goto out; } } out: return ret; } int tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, 
const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, const double *time, const char *derived_state, const tsk_size_t *derived_state_offset, const char *metadata, const tsk_size_t *metadata_offset) { int ret = 0; tsk_size_t j, derived_state_length, metadata_length; if (site == NULL || node == NULL || derived_state == NULL || derived_state_offset == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if ((metadata == NULL) != (metadata_offset == NULL)) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = tsk_mutation_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } tsk_memcpy(self->site + self->num_rows, site, num_rows * sizeof(tsk_id_t)); tsk_memcpy(self->node + self->num_rows, node, num_rows * sizeof(tsk_id_t)); if (parent == NULL) { /* If parent is NULL, set all parents to the null mutation */ tsk_memset(self->parent + self->num_rows, 0xff, num_rows * sizeof(tsk_id_t)); } else { tsk_memcpy(self->parent + self->num_rows, parent, num_rows * sizeof(tsk_id_t)); } if (time == NULL) { /* If time is NULL, set all times to TSK_UNKNOWN_TIME which is the * default */ for (j = 0; j < num_rows; j++) { self->time[self->num_rows + j] = TSK_UNKNOWN_TIME; } } else { tsk_memcpy(self->time + self->num_rows, time, num_rows * sizeof(double)); } /* Metadata column */ if (metadata == NULL) { for (j = 0; j < num_rows; j++) { self->metadata_offset[self->num_rows + j + 1] = self->metadata_length; } } else { ret = check_offsets(num_rows, metadata_offset, 0, false); if (ret != 0) { goto out; } metadata_length = metadata_offset[num_rows]; ret = tsk_mutation_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } tsk_memcpy(self->metadata + self->metadata_length, metadata, metadata_length * sizeof(char)); for (j = 0; j < num_rows; j++) { self->metadata_offset[self->num_rows + j] = self->metadata_length + metadata_offset[j]; } self->metadata_length += metadata_length; } self->metadata_offset[self->num_rows + 
num_rows] = self->metadata_length; /* Derived state column */ ret = check_offsets(num_rows, derived_state_offset, 0, false); if (ret != 0) { goto out; } derived_state_length = derived_state_offset[num_rows]; ret = tsk_mutation_table_expand_derived_state(self, derived_state_length); if (ret != 0) { goto out; } tsk_memcpy(self->derived_state + self->derived_state_length, derived_state, derived_state_length * sizeof(char)); for (j = 0; j < num_rows; j++) { self->derived_state_offset[self->num_rows + j] = self->derived_state_length + derived_state_offset[j]; } self->derived_state_length += derived_state_length; self->derived_state_offset[self->num_rows + num_rows] = self->derived_state_length; self->num_rows += num_rows; out: return ret; } int TSK_WARN_UNUSED tsk_mutation_table_takeset_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, double *time, char *derived_state, tsk_size_t *derived_state_offset, char *metadata, tsk_size_t *metadata_offset) { tsk_size_t j; int ret = 0; if (site == NULL || node == NULL || derived_state == NULL || derived_state_offset == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } /* We need to check all the inputs before we start freeing or taking memory */ ret = check_ragged_column(num_rows, derived_state, derived_state_offset); if (ret != 0) { goto out; } ret = check_ragged_column(num_rows, metadata, metadata_offset); if (ret != 0) { goto out; } tsk_mutation_table_free_columns(self); self->num_rows = num_rows; self->max_rows = num_rows; self->site = site; self->node = node; ret = takeset_optional_id_column(num_rows, parent, &self->parent); if (ret != 0) { goto out; } if (time == NULL) { /* Time defaults to unknown time if not specified. 
*/ self->time = tsk_malloc(num_rows * sizeof(*self->time)); if (self->time == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < num_rows; j++) { self->time[j] = TSK_UNKNOWN_TIME; } } else { self->time = time; } ret = takeset_ragged_column(num_rows, derived_state, derived_state_offset, (void *) &self->derived_state, &self->derived_state_offset, &self->derived_state_length); if (ret != 0) { goto out; } ret = takeset_ragged_column(num_rows, metadata, metadata_offset, (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); if (ret != 0) { goto out; } out: return ret; } int TSK_WARN_UNUSED tsk_mutation_table_copy( const tsk_mutation_table_t *self, tsk_mutation_table_t *dest, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_mutation_table_init(dest, 0); if (ret != 0) { goto out; } } ret = tsk_mutation_table_set_columns(dest, self->num_rows, self->site, self->node, self->parent, self->time, self->derived_state, self->derived_state_offset, self->metadata, self->metadata_offset); if (ret != 0) { goto out; } ret = tsk_mutation_table_set_metadata_schema( dest, self->metadata_schema, self->metadata_schema_length); out: return ret; } int tsk_mutation_table_set_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, const double *time, const char *derived_state, const tsk_size_t *derived_state_offset, const char *metadata, const tsk_size_t *metadata_offset) { int ret = 0; ret = tsk_mutation_table_clear(self); if (ret != 0) { goto out; } ret = tsk_mutation_table_append_columns(self, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); out: return ret; } bool tsk_mutation_table_equals(const tsk_mutation_table_t *self, const tsk_mutation_table_t *other, tsk_flags_t options) { bool ret = self->num_rows == other->num_rows && self->derived_state_length == other->derived_state_length && 
tsk_memcmp(self->site, other->site, self->num_rows * sizeof(tsk_id_t)) == 0 && tsk_memcmp(self->node, other->node, self->num_rows * sizeof(tsk_id_t)) == 0 && tsk_memcmp(self->parent, other->parent, self->num_rows * sizeof(tsk_id_t)) == 0 && tsk_memcmp(self->time, other->time, self->num_rows * sizeof(double)) == 0 && tsk_memcmp(self->derived_state_offset, other->derived_state_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->derived_state, other->derived_state, self->derived_state_length * sizeof(char)) == 0; if (!(options & TSK_CMP_IGNORE_METADATA)) { ret = ret && self->metadata_length == other->metadata_length && self->metadata_schema_length == other->metadata_schema_length && tsk_memcmp(self->metadata_offset, other->metadata_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->metadata, other->metadata, self->metadata_length * sizeof(char)) == 0 && tsk_memcmp(self->metadata_schema, other->metadata_schema, self->metadata_schema_length * sizeof(char)) == 0 && tsk_memcmp(self->metadata_schema, other->metadata_schema, self->metadata_schema_length * sizeof(char)) == 0; } return ret; } int tsk_mutation_table_clear(tsk_mutation_table_t *self) { return tsk_mutation_table_truncate(self, 0); } int tsk_mutation_table_truncate(tsk_mutation_table_t *mutations, tsk_size_t num_rows) { int ret = 0; if (num_rows > mutations->num_rows) { ret = tsk_trace_error(TSK_ERR_BAD_TABLE_POSITION); goto out; } mutations->num_rows = num_rows; mutations->derived_state_length = mutations->derived_state_offset[num_rows]; mutations->metadata_length = mutations->metadata_offset[num_rows]; out: return ret; } int tsk_mutation_table_extend(tsk_mutation_table_t *self, const tsk_mutation_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_mutation_t mutation; if (self == other) { ret = tsk_trace_error(TSK_ERR_CANNOT_EXTEND_FROM_SELF); goto out; } /* We know 
how much to expand the non-ragged columns, so do it ahead of time */ ret = tsk_mutation_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } for (j = 0; j < num_rows; j++) { ret = tsk_mutation_table_get_row( other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &mutation); if (ret != 0) { goto out; } ret_id = tsk_mutation_table_add_row(self, mutation.site, mutation.node, mutation.parent, mutation.time, mutation.derived_state, mutation.derived_state_length, mutation.metadata, mutation.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; out: return ret; } void tsk_mutation_table_print_state(const tsk_mutation_table_t *self, FILE *out) { int ret; fprintf(out, "\n" TABLE_SEP); fprintf(out, "mutation_table: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, (long long) self->max_rows_increment); fprintf(out, "derived_state_length = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->derived_state_length, (long long) self->max_derived_state_length, (long long) self->max_derived_state_length_increment); fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->metadata_length, (long long) self->max_metadata_length, (long long) self->max_metadata_length_increment); fprintf(out, TABLE_SEP); ret = tsk_mutation_table_dump_text(self, out); tsk_bug_assert(ret == 0); tsk_bug_assert(self->derived_state_offset[0] == 0); tsk_bug_assert( self->derived_state_length == self->derived_state_offset[self->num_rows]); tsk_bug_assert(self->metadata_offset[0] == 0); tsk_bug_assert(self->metadata_length == self->metadata_offset[self->num_rows]); } static inline void tsk_mutation_table_get_row_unsafe( const tsk_mutation_table_t *self, tsk_id_t index, tsk_mutation_t *row) { row->id = (tsk_id_t) index; row->site = self->site[index]; row->node = self->node[index]; row->parent = self->parent[index]; row->time = 
self->time[index]; row->derived_state_length = self->derived_state_offset[index + 1] - self->derived_state_offset[index]; row->derived_state = self->derived_state + self->derived_state_offset[index]; row->metadata_length = self->metadata_offset[index + 1] - self->metadata_offset[index]; row->metadata = self->metadata + self->metadata_offset[index]; row->edge = TSK_NULL; } int tsk_mutation_table_get_row( const tsk_mutation_table_t *self, tsk_id_t index, tsk_mutation_t *row) { int ret = 0; if (index < 0 || index >= (tsk_id_t) self->num_rows) { ret = tsk_trace_error(TSK_ERR_MUTATION_OUT_OF_BOUNDS); goto out; } tsk_mutation_table_get_row_unsafe(self, index, row); out: return ret; } int tsk_mutation_table_set_metadata_schema(tsk_mutation_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, &self->metadata_schema_length, metadata_schema, metadata_schema_length); } int tsk_mutation_table_dump_text(const tsk_mutation_table_t *self, FILE *out) { int ret = TSK_ERR_IO; int err; tsk_size_t j, derived_state_len, metadata_len; err = write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); if (err < 0) { goto out; } err = fprintf(out, "id\tsite\tnode\tparent\ttime\tderived_state\tmetadata\n"); if (err < 0) { goto out; } for (j = 0; j < self->num_rows; j++) { derived_state_len = self->derived_state_offset[j + 1] - self->derived_state_offset[j]; metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; err = fprintf(out, "%lld\t%lld\t%lld\t%lld\t%f\t%.*s\t%.*s\n", (long long) j, (long long) self->site[j], (long long) self->node[j], (long long) self->parent[j], self->time[j], (int) derived_state_len, self->derived_state + self->derived_state_offset[j], (int) metadata_len, self->metadata + self->metadata_offset[j]); if (err < 0) { goto out; } } ret = 0; out: return ret; } int tsk_mutation_table_keep_rows(tsk_mutation_table_t *self, const tsk_bool_t *keep, 
tsk_flags_t TSK_UNUSED(options), tsk_id_t *ret_id_map) { int ret = 0; const tsk_size_t current_num_rows = self->num_rows; tsk_size_t j, remaining_rows; tsk_id_t pj; tsk_id_t *id_map = ret_id_map; tsk_id_t *restrict parent = self->parent; if (ret_id_map == NULL) { id_map = tsk_malloc(current_num_rows * sizeof(*id_map)); if (id_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } } keep_mask_to_id_map(current_num_rows, keep, id_map); /* Note: we could add some options to avoid these checks if we wanted. * MAP_DELETED_TO_NULL is an obvious one, and I guess it might be * helpful to also provide NO_REMAP to prevent reference remapping * entirely. */ for (j = 0; j < current_num_rows; j++) { if (keep[j]) { pj = parent[j]; if (pj != TSK_NULL) { if (pj < 0 || pj >= (tsk_id_t) current_num_rows) { ret = tsk_trace_error(TSK_ERR_MUTATION_OUT_OF_BOUNDS); goto out; } if (id_map[pj] == TSK_NULL) { ret = tsk_trace_error(TSK_ERR_KEEP_ROWS_MAP_TO_DELETED); goto out; } } } } remaining_rows = subset_id_column(self->site, current_num_rows, keep); subset_id_column(self->node, current_num_rows, keep); subset_remap_id_column(parent, current_num_rows, keep, id_map); subset_double_column(self->time, current_num_rows, keep); self->derived_state_length = subset_ragged_char_column( self->derived_state, self->derived_state_offset, current_num_rows, keep); if (self->metadata_length > 0) { self->metadata_length = subset_ragged_char_column( self->metadata, self->metadata_offset, current_num_rows, keep); } self->num_rows = remaining_rows; out: if (ret_id_map == NULL) { tsk_safe_free(id_map); } return ret; } static int tsk_mutation_table_dump( const tsk_mutation_table_t *self, kastore_t *store, tsk_flags_t options) { const write_table_col_t cols[] = { { "mutations/site", (void *) self->site, self->num_rows, TSK_ID_STORAGE_TYPE }, { "mutations/node", (void *) self->node, self->num_rows, TSK_ID_STORAGE_TYPE }, { "mutations/parent", (void *) self->parent, self->num_rows, 
TSK_ID_STORAGE_TYPE },
        { "mutations/time", (void *) self->time, self->num_rows, KAS_FLOAT64 },
        { "mutations/metadata_schema", (void *) self->metadata_schema,
            self->metadata_schema_length, KAS_UINT8 },
        /* NULL-named entry; presumably the end-of-list sentinel. */
        { .name = NULL },
    };
    /* Ragged columns are described by data buffer, data length, and the
     * per-row offset array. */
    const write_table_ragged_col_t ragged_cols[] = {
        { "mutations/derived_state", (void *) self->derived_state,
            self->derived_state_length, KAS_UINT8, self->derived_state_offset,
            self->num_rows },
        { "mutations/metadata", (void *) self->metadata, self->metadata_length,
            KAS_UINT8, self->metadata_offset, self->num_rows },
        { .name = NULL },
    };

    return write_table(store, cols, ragged_cols, options);
}

/* Reads the "mutations/..." columns from `store` into local buffers and
 * then installs them in `self`. */
static int
tsk_mutation_table_load(tsk_mutation_table_t *self, kastore_t *store)
{
    int ret = 0;
    tsk_id_t *node = NULL;
    tsk_id_t *site = NULL;
    tsk_id_t *parent = NULL;
    double *time = NULL;
    char *derived_state = NULL;
    tsk_size_t *derived_state_offset = NULL;
    char *metadata = NULL;
    tsk_size_t *metadata_offset = NULL;
    char *metadata_schema = NULL;
    tsk_size_t num_rows, derived_state_length, metadata_length, metadata_schema_length;

    /* Each descriptor holds the address of the local that receives the
     * column. Note that "mutations/time" is the only fixed-width column
     * flagged TSK_COL_OPTIONAL. */
    read_table_col_t cols[] = {
        { "mutations/site", (void **) &site, TSK_ID_STORAGE_TYPE, 0 },
        { "mutations/node", (void **) &node, TSK_ID_STORAGE_TYPE, 0 },
        { "mutations/parent", (void **) &parent, TSK_ID_STORAGE_TYPE, 0 },
        { "mutations/time", (void **) &time, KAS_FLOAT64, TSK_COL_OPTIONAL },
        { .name = NULL },
    };
    read_table_ragged_col_t ragged_cols[] = {
        { "mutations/derived_state", (void **) &derived_state, &derived_state_length,
            KAS_UINT8, &derived_state_offset, 0 },
        { "mutations/metadata", (void **) &metadata, &metadata_length, KAS_UINT8,
            &metadata_offset, 0 },
        { .name = NULL },
    };
    read_table_property_t properties[] = {
        { "mutations/metadata_schema", (void **) &metadata_schema,
            &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL },
        { .name = NULL },
    };

    ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0);
    if (ret != 0) {
        goto out;
    }
    /* The schema property is optional, so only set it when it was read. */
    if (metadata_schema != NULL) {
        ret = tsk_mutation_table_set_metadata_schema(
            self,
metadata_schema, metadata_schema_length);
        if (ret != 0) {
            goto out;
        }
    }
    ret = tsk_mutation_table_takeset_columns(self, num_rows, site, node, parent, time,
        derived_state, derived_state_offset, metadata, metadata_offset);
    if (ret != 0) {
        goto out;
    }
    /* takeset succeeded: clear the locals whose addresses are registered in
     * cols/ragged_cols -- presumably so free_read_table_mem below does not
     * release buffers the table now holds. */
    site = NULL;
    node = NULL;
    parent = NULL;
    time = NULL;
    derived_state = NULL;
    derived_state_offset = NULL;
    metadata = NULL;
    metadata_offset = NULL;

out:
    free_read_table_mem(cols, ragged_cols, properties);
    return ret;
}

/*************************
 * migration table
 *************************/

/* Releases every column buffer of the migration table (not the metadata
 * schema). */
static void
tsk_migration_table_free_columns(tsk_migration_table_t *self)
{
    tsk_safe_free(self->left);
    tsk_safe_free(self->right);
    tsk_safe_free(self->node);
    tsk_safe_free(self->source);
    tsk_safe_free(self->dest);
    tsk_safe_free(self->time);
    tsk_safe_free(self->metadata);
    tsk_safe_free(self->metadata_offset);
}

/* Releases the column buffers and the metadata schema. Always returns 0. */
int
tsk_migration_table_free(tsk_migration_table_t *self)
{
    tsk_migration_table_free_columns(self);
    tsk_safe_free(self->metadata_schema);
    return 0;
}

/* Ensures the fixed-width columns have room for `additional_rows` more
 * rows. The new capacity comes from calculate_max_rows; columns are only
 * resized when num_rows + additional_rows exceeds the current max_rows. */
static int
tsk_migration_table_expand_main_columns(
    tsk_migration_table_t *self, tsk_size_t additional_rows)
{
    int ret = 0;
    tsk_size_t new_max_rows;

    ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment,
        additional_rows, &new_max_rows);
    if (ret != 0) {
        goto out;
    }
    if ((self->num_rows + additional_rows) > self->max_rows) {
        ret = expand_column((void **) &self->left, new_max_rows, sizeof(double));
        if (ret != 0) {
            goto out;
        }
        ret = expand_column((void **) &self->right, new_max_rows, sizeof(double));
        if (ret != 0) {
            goto out;
        }
        ret = expand_column((void **) &self->node, new_max_rows, sizeof(tsk_id_t));
        if (ret != 0) {
            goto out;
        }
        ret = expand_column((void **) &self->source, new_max_rows, sizeof(tsk_id_t));
        if (ret != 0) {
            goto out;
        }
        ret = expand_column((void **) &self->dest, new_max_rows, sizeof(tsk_id_t));
        if (ret != 0) {
            goto out;
        }
        ret = expand_column((void **) &self->time, new_max_rows, sizeof(double));
        if (ret != 0) {
            goto out;
        }
        ret = expand_column(
            (void
**) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } self->max_rows = new_max_rows; } out: return ret; } static int tsk_migration_table_expand_metadata( tsk_migration_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->metadata_length, additional_length, self->max_metadata_length_increment, &self->max_metadata_length, (void **) &self->metadata, sizeof(*self->metadata)); } int tsk_migration_table_set_max_rows_increment( tsk_migration_table_t *self, tsk_size_t max_rows_increment) { self->max_rows_increment = max_rows_increment; return 0; } int tsk_migration_table_set_max_metadata_length_increment( tsk_migration_table_t *self, tsk_size_t max_metadata_length_increment) { self->max_metadata_length_increment = max_metadata_length_increment; return 0; } int tsk_migration_table_init(tsk_migration_table_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_memset(self, 0, sizeof(tsk_migration_table_t)); /* Allocate space for one row initially, ensuring we always have valid pointers * even if the table is empty */ self->max_rows_increment = 1; self->max_metadata_length_increment = 1; ret = tsk_migration_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_migration_table_expand_metadata(self, 1); if (ret != 0) { goto out; } self->metadata_offset[0] = 0; self->max_rows_increment = 0; self->max_metadata_length_increment = 0; tsk_migration_table_set_metadata_schema(self, NULL, 0); out: return ret; } int tsk_migration_table_append_columns(tsk_migration_table_t *self, tsk_size_t num_rows, const double *left, const double *right, const tsk_id_t *node, const tsk_id_t *source, const tsk_id_t *dest, const double *time, const char *metadata, const tsk_size_t *metadata_offset) { int ret; tsk_size_t j, metadata_length; if (left == NULL || right == NULL || node == NULL || source == NULL || dest == NULL || time == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if 
((metadata == NULL) != (metadata_offset == NULL)) {
        /* metadata and metadata_offset must be both NULL or both non-NULL. */
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }

    ret = tsk_migration_table_expand_main_columns(self, num_rows);
    if (ret != 0) {
        goto out;
    }
    /* Copy the fixed-width columns in after the existing rows. */
    tsk_memcpy(self->left + self->num_rows, left, num_rows * sizeof(double));
    tsk_memcpy(self->right + self->num_rows, right, num_rows * sizeof(double));
    tsk_memcpy(self->node + self->num_rows, node, num_rows * sizeof(tsk_id_t));
    tsk_memcpy(self->source + self->num_rows, source, num_rows * sizeof(tsk_id_t));
    tsk_memcpy(self->dest + self->num_rows, dest, num_rows * sizeof(tsk_id_t));
    tsk_memcpy(self->time + self->num_rows, time, num_rows * sizeof(double));
    if (metadata == NULL) {
        /* No metadata supplied: every new row is empty, so each new end
         * offset is the current metadata length. */
        for (j = 0; j < num_rows; j++) {
            self->metadata_offset[self->num_rows + j + 1] = self->metadata_length;
        }
    } else {
        ret = check_offsets(num_rows, metadata_offset, 0, false);
        if (ret != 0) {
            goto out;
        }
        /* Rebase the caller's offsets onto the end of the existing
         * metadata. */
        for (j = 0; j < num_rows; j++) {
            self->metadata_offset[self->num_rows + j]
                = (tsk_size_t) self->metadata_length + metadata_offset[j];
        }
        /* The final input offset is the total number of bytes to append. */
        metadata_length = metadata_offset[num_rows];
        ret = tsk_migration_table_expand_metadata(self, metadata_length);
        if (ret != 0) {
            goto out;
        }
        tsk_memcpy(self->metadata + self->metadata_length, metadata,
            metadata_length * sizeof(char));
        self->metadata_length += metadata_length;
    }
    self->num_rows += num_rows;
    /* Write the closing offset for the last row. */
    self->metadata_offset[self->num_rows] = self->metadata_length;
out:
    return ret;
}

/* Replaces the table's columns with the caller's buffers (stored by pointer
 * rather than copied). The six fixed-width columns are mandatory; the
 * metadata pair is validated by check_ragged_column before anything is
 * freed. */
int TSK_WARN_UNUSED
tsk_migration_table_takeset_columns(tsk_migration_table_t *self, tsk_size_t num_rows,
    double *left, double *right, tsk_id_t *node, tsk_id_t *source, tsk_id_t *dest,
    double *time, char *metadata, tsk_size_t *metadata_offset)
{
    int ret = 0;

    if (left == NULL || right == NULL || node == NULL || source == NULL || dest == NULL
        || time == NULL) {
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }

    /* We need to check all the inputs before we start freeing or taking memory */
    ret = check_ragged_column(num_rows, metadata, metadata_offset);
    if (ret != 0) {
        goto out;
    }
tsk_migration_table_free_columns(self);
    /* Capacity equals the row count because the supplied buffers are
     * installed as-is. */
    self->num_rows = num_rows;
    self->max_rows = num_rows;
    self->left = left;
    self->right = right;
    self->node = node;
    self->source = source;
    self->dest = dest;
    self->time = time;

    ret = takeset_ragged_column(num_rows, metadata, metadata_offset,
        (void *) &self->metadata, &self->metadata_offset, &self->metadata_length);
    if (ret != 0) {
        goto out;
    }
out:
    return ret;
}

/* Copies all rows and the metadata schema of `self` into `dest`.
 * `dest` is initialised first unless TSK_NO_INIT is set in `options`. */
int TSK_WARN_UNUSED
tsk_migration_table_copy(
    const tsk_migration_table_t *self, tsk_migration_table_t *dest, tsk_flags_t options)
{
    int ret = 0;

    if (!(options & TSK_NO_INIT)) {
        ret = tsk_migration_table_init(dest, 0);
        if (ret != 0) {
            goto out;
        }
    }
    ret = tsk_migration_table_set_columns(dest, self->num_rows, self->left, self->right,
        self->node, self->source, self->dest, self->time, self->metadata,
        self->metadata_offset);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_migration_table_set_metadata_schema(
        dest, self->metadata_schema, self->metadata_schema_length);
out:
    return ret;
}

/* Replaces the table's rows with copies of the supplied columns: clears
 * the table, then appends. */
int
tsk_migration_table_set_columns(tsk_migration_table_t *self, tsk_size_t num_rows,
    const double *left, const double *right, const tsk_id_t *node,
    const tsk_id_t *source, const tsk_id_t *dest, const double *time,
    const char *metadata, const tsk_size_t *metadata_offset)
{
    int ret;

    ret = tsk_migration_table_clear(self);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_migration_table_append_columns(self, num_rows, left, right, node, source,
        dest, time, metadata, metadata_offset);
out:
    return ret;
}

/* Appends one row, growing the columns as needed.
 * Returns the new row's ID, or a negative error code. */
tsk_id_t
tsk_migration_table_add_row(tsk_migration_table_t *self, double left, double right,
    tsk_id_t node, tsk_id_t source, tsk_id_t dest, double time, const char *metadata,
    tsk_size_t metadata_length)
{
    tsk_id_t ret = 0;

    ret = tsk_migration_table_expand_main_columns(self, 1);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_migration_table_expand_metadata(self, metadata_length);
    if (ret != 0) {
        goto out;
    }

    tsk_bug_assert(self->num_rows < self->max_rows);
    tsk_bug_assert(self->metadata_length + metadata_length <=
self->max_metadata_length); tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length); self->left[self->num_rows] = left; self->right[self->num_rows] = right; self->node[self->num_rows] = node; self->source[self->num_rows] = source; self->dest[self->num_rows] = dest; self->time[self->num_rows] = time; self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; self->metadata_length += metadata_length; ret = (tsk_id_t) self->num_rows; self->num_rows++; out: return ret; } static int tsk_migration_table_update_row_rewrite(tsk_migration_table_t *self, tsk_id_t index, double left, double right, tsk_id_t node, tsk_id_t source, tsk_id_t dest, double time, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_id_t j, ret_id; tsk_migration_table_t copy; tsk_size_t num_rows; tsk_id_t *rows = NULL; ret = tsk_migration_table_copy(self, ©, 0); if (ret != 0) { goto out; } rows = tsk_malloc(self->num_rows * sizeof(*rows)); if (rows == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_migration_table_truncate(self, (tsk_size_t) index); tsk_bug_assert(ret == 0); ret_id = tsk_migration_table_add_row( self, left, right, node, source, dest, time, metadata, metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_rows = 0; for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { rows[num_rows] = j; num_rows++; } ret = tsk_migration_table_extend(self, ©, num_rows, rows, 0); if (ret != 0) { goto out; } out: tsk_migration_table_free(©); tsk_safe_free(rows); return ret; } int tsk_migration_table_update_row(tsk_migration_table_t *self, tsk_id_t index, double left, double right, tsk_id_t node, tsk_id_t source, tsk_id_t dest, double time, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_migration_t current_row; ret = tsk_migration_table_get_row(self, index, ¤t_row); if (ret != 0) { goto out; } if (current_row.metadata_length == metadata_length) { self->left[index] = left; 
self->right[index] = right;
        self->node[index] = node;
        self->source[index] = source;
        self->dest[index] = dest;
        self->time[index] = time;
        /* Note: important to use tsk_memmove here as we may be provided pointers
         * to the column memory as input via get_row */
        tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata,
            metadata_length * sizeof(*metadata));
    } else {
        /* Metadata length changed: fall back to the rewrite path. */
        ret = tsk_migration_table_update_row_rewrite(self, index, left, right, node,
            source, dest, time, metadata, metadata_length);
        if (ret != 0) {
            goto out;
        }
    }
out:
    return ret;
}

/* Removes all rows; equivalent to truncating to zero rows. */
int
tsk_migration_table_clear(tsk_migration_table_t *self)
{
    return tsk_migration_table_truncate(self, 0);
}

/* Shrinks the table to its first `num_rows` rows without releasing memory.
 * Returns TSK_ERR_BAD_TABLE_POSITION if `num_rows` exceeds the current row
 * count, otherwise 0. */
int
tsk_migration_table_truncate(tsk_migration_table_t *self, tsk_size_t num_rows)
{
    int ret = 0;

    if (num_rows > self->num_rows) {
        ret = tsk_trace_error(TSK_ERR_BAD_TABLE_POSITION);
        goto out;
    }
    self->num_rows = num_rows;
    /* The start offset of the first dropped row is the new metadata length. */
    self->metadata_length = self->metadata_offset[num_rows];
out:
    return ret;
}

/* Appends `num_rows` rows taken from `other`. Rows are selected by
 * `row_indexes`, or are rows 0..num_rows-1 when `row_indexes` is NULL.
 * Extending a table from itself is rejected. */
int
tsk_migration_table_extend(tsk_migration_table_t *self,
    const tsk_migration_table_t *other, tsk_size_t num_rows,
    const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options))
{
    int ret = 0;
    tsk_id_t ret_id;
    tsk_size_t j;
    tsk_migration_t migration;

    if (self == other) {
        ret = tsk_trace_error(TSK_ERR_CANNOT_EXTEND_FROM_SELF);
        goto out;
    }

    /* We know how much to expand the non-ragged columns, so do it ahead of time */
    ret = tsk_migration_table_expand_main_columns(self, num_rows);
    if (ret != 0) {
        goto out;
    }
    for (j = 0; j < num_rows; j++) {
        ret = tsk_migration_table_get_row(
            other, row_indexes == NULL ?
(tsk_id_t) j : row_indexes[j], &migration); if (ret != 0) { goto out; } ret_id = tsk_migration_table_add_row(self, migration.left, migration.right, migration.node, migration.source, migration.dest, migration.time, migration.metadata, migration.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; out: return ret; } void tsk_migration_table_print_state(const tsk_migration_table_t *self, FILE *out) { int ret; fprintf(out, "\n" TABLE_SEP); fprintf(out, "migration_table: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, (long long) self->max_rows_increment); fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->metadata_length, (long long) self->max_metadata_length, (long long) self->max_metadata_length_increment); fprintf(out, TABLE_SEP); ret = tsk_migration_table_dump_text(self, out); tsk_bug_assert(ret == 0); } static inline void tsk_migration_table_get_row_unsafe( const tsk_migration_table_t *self, tsk_id_t index, tsk_migration_t *row) { row->id = (tsk_id_t) index; row->left = self->left[index]; row->right = self->right[index]; row->node = self->node[index]; row->source = self->source[index]; row->dest = self->dest[index]; row->time = self->time[index]; row->metadata_length = self->metadata_offset[index + 1] - self->metadata_offset[index]; row->metadata = self->metadata + self->metadata_offset[index]; } int tsk_migration_table_get_row( const tsk_migration_table_t *self, tsk_id_t index, tsk_migration_t *row) { int ret = 0; if (index < 0 || index >= (tsk_id_t) self->num_rows) { ret = tsk_trace_error(TSK_ERR_MIGRATION_OUT_OF_BOUNDS); goto out; } tsk_migration_table_get_row_unsafe(self, index, row); out: return ret; } int tsk_migration_table_set_metadata_schema(tsk_migration_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, 
&self->metadata_schema_length, metadata_schema, metadata_schema_length); } int tsk_migration_table_dump_text(const tsk_migration_table_t *self, FILE *out) { tsk_size_t j; int ret = TSK_ERR_IO; tsk_size_t metadata_len; int err; err = write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); if (err < 0) { goto out; } err = fprintf(out, "left\tright\tnode\tsource\tdest\ttime\tmetadata\n"); if (err < 0) { goto out; } for (j = 0; j < self->num_rows; j++) { metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; err = fprintf(out, "%.3f\t%.3f\t%lld\t%lld\t%lld\t%f\t%.*s\n", self->left[j], self->right[j], (long long) self->node[j], (long long) self->source[j], (long long) self->dest[j], self->time[j], (int) metadata_len, self->metadata + self->metadata_offset[j]); if (err < 0) { goto out; } } ret = 0; out: return ret; } bool tsk_migration_table_equals(const tsk_migration_table_t *self, const tsk_migration_table_t *other, tsk_flags_t options) { bool ret = self->num_rows == other->num_rows && tsk_memcmp(self->left, other->left, self->num_rows * sizeof(double)) == 0 && tsk_memcmp(self->right, other->right, self->num_rows * sizeof(double)) == 0 && tsk_memcmp(self->node, other->node, self->num_rows * sizeof(tsk_id_t)) == 0 && tsk_memcmp(self->source, other->source, self->num_rows * sizeof(tsk_id_t)) == 0 && tsk_memcmp(self->dest, other->dest, self->num_rows * sizeof(tsk_id_t)) == 0 && tsk_memcmp(self->time, other->time, self->num_rows * sizeof(double)) == 0; if (!(options & TSK_CMP_IGNORE_METADATA)) { ret = ret && self->metadata_length == other->metadata_length && self->metadata_schema_length == other->metadata_schema_length && tsk_memcmp(self->metadata_offset, other->metadata_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->metadata, other->metadata, self->metadata_length * sizeof(char)) == 0 && tsk_memcmp(self->metadata_schema, other->metadata_schema, self->metadata_schema_length * sizeof(char)) == 0; 
}
    return ret;
}

/* Keeps only the rows whose `keep` entry is true, subsetting each column in
 * place. When `id_map` is non-NULL it is filled by keep_mask_to_id_map.
 * Always returns 0. */
int
tsk_migration_table_keep_rows(tsk_migration_table_t *self, const tsk_bool_t *keep,
    tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map)
{
    int ret = 0;
    tsk_size_t remaining_rows;

    if (id_map != NULL) {
        keep_mask_to_id_map(self->num_rows, keep, id_map);
    }

    /* The surviving row count is taken from the first column subset. */
    remaining_rows = subset_double_column(self->left, self->num_rows, keep);
    subset_double_column(self->right, self->num_rows, keep);
    subset_id_column(self->node, self->num_rows, keep);
    subset_id_column(self->source, self->num_rows, keep);
    subset_id_column(self->dest, self->num_rows, keep);
    subset_double_column(self->time, self->num_rows, keep);
    /* The metadata column is only subset when it holds any bytes. */
    if (self->metadata_length > 0) {
        self->metadata_length = subset_ragged_char_column(
            self->metadata, self->metadata_offset, self->num_rows, keep);
    }
    self->num_rows = remaining_rows;
    return ret;
}

/* Describes the migration table's columns under "migrations/..." keys and
 * hands the descriptors to write_table. */
static int
tsk_migration_table_dump(
    const tsk_migration_table_t *self, kastore_t *store, tsk_flags_t options)
{
    const write_table_col_t cols[] = {
        { "migrations/left", (void *) self->left, self->num_rows, KAS_FLOAT64 },
        { "migrations/right", (void *) self->right, self->num_rows, KAS_FLOAT64 },
        { "migrations/node", (void *) self->node, self->num_rows, TSK_ID_STORAGE_TYPE },
        { "migrations/source", (void *) self->source, self->num_rows,
            TSK_ID_STORAGE_TYPE },
        { "migrations/dest", (void *) self->dest, self->num_rows, TSK_ID_STORAGE_TYPE },
        { "migrations/time", (void *) self->time, self->num_rows, KAS_FLOAT64 },
        { "migrations/metadata_schema", (void *) self->metadata_schema,
            self->metadata_schema_length, KAS_UINT8 },
        /* NULL-named entry; presumably the end-of-list sentinel. */
        { .name = NULL },
    };
    const write_table_ragged_col_t ragged_cols[] = {
        { "migrations/metadata", (void *) self->metadata, self->metadata_length,
            KAS_UINT8, self->metadata_offset, self->num_rows },
        { .name = NULL },
    };

    return write_table(store, cols, ragged_cols, options);
}

/* Reads the "migrations/..." columns from `store` into local buffers and
 * then installs them in `self`. */
static int
tsk_migration_table_load(tsk_migration_table_t *self, kastore_t *store)
{
    int ret = 0;
    tsk_id_t *source = NULL;
    tsk_id_t *dest = NULL;
    tsk_id_t *node = NULL;
    double *left = NULL;
double *right = NULL;
    double *time = NULL;
    char *metadata = NULL;
    tsk_size_t *metadata_offset = NULL;
    char *metadata_schema = NULL;
    tsk_size_t num_rows, metadata_length, metadata_schema_length;

    /* Each descriptor holds the address of the local that receives the
     * column; all six fixed-width columns carry flags of 0. */
    read_table_col_t cols[] = {
        { "migrations/left", (void **) &left, KAS_FLOAT64, 0 },
        { "migrations/right", (void **) &right, KAS_FLOAT64, 0 },
        { "migrations/node", (void **) &node, TSK_ID_STORAGE_TYPE, 0 },
        { "migrations/source", (void **) &source, TSK_ID_STORAGE_TYPE, 0 },
        { "migrations/dest", (void **) &dest, TSK_ID_STORAGE_TYPE, 0 },
        { "migrations/time", (void **) &time, KAS_FLOAT64, 0 },
        { .name = NULL },
    };
    /* The metadata column and the schema are both flagged TSK_COL_OPTIONAL. */
    read_table_ragged_col_t ragged_cols[] = {
        { "migrations/metadata", (void **) &metadata, &metadata_length, KAS_UINT8,
            &metadata_offset, TSK_COL_OPTIONAL },
        { .name = NULL },
    };
    read_table_property_t properties[] = {
        { "migrations/metadata_schema", (void **) &metadata_schema,
            &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL },
        { .name = NULL },
    };

    ret = read_table(store, &num_rows, cols, ragged_cols, properties, 0);
    if (ret != 0) {
        goto out;
    }
    if (metadata_schema != NULL) {
        ret = tsk_migration_table_set_metadata_schema(
            self, metadata_schema, metadata_schema_length);
        if (ret != 0) {
            goto out;
        }
    }
    ret = tsk_migration_table_takeset_columns(self, num_rows, left, right, node, source,
        dest, time, metadata, metadata_offset);
    if (ret != 0) {
        goto out;
    }
    /* takeset succeeded: clear the locals whose addresses are registered in
     * cols/ragged_cols -- presumably so free_read_table_mem below does not
     * release buffers the table now holds. */
    left = NULL;
    right = NULL;
    node = NULL;
    source = NULL;
    dest = NULL;
    time = NULL;
    metadata = NULL;
    metadata_offset = NULL;

out:
    free_read_table_mem(cols, ragged_cols, properties);
    return ret;
}

/*************************
 * population table
 *************************/

/* Releases the population table's two column buffers (not the schema). */
static void
tsk_population_table_free_columns(tsk_population_table_t *self)
{
    tsk_safe_free(self->metadata);
    tsk_safe_free(self->metadata_offset);
}

/* Releases the column buffers and the metadata schema. Always returns 0. */
int
tsk_population_table_free(tsk_population_table_t *self)
{
    tsk_population_table_free_columns(self);
    tsk_safe_free(self->metadata_schema);
    return 0;
}

static int
tsk_population_table_expand_main_columns(
tsk_population_table_t *self, tsk_size_t additional_rows) { int ret = 0; tsk_size_t new_max_rows; ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, additional_rows, &new_max_rows); if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { ret = expand_column( (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } self->max_rows = new_max_rows; } out: return ret; } static int tsk_population_table_expand_metadata( tsk_population_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->metadata_length, additional_length, self->max_metadata_length_increment, &self->max_metadata_length, (void **) &self->metadata, sizeof(*self->metadata)); } int tsk_population_table_set_max_rows_increment( tsk_population_table_t *self, tsk_size_t max_rows_increment) { self->max_rows_increment = max_rows_increment; return 0; } int tsk_population_table_set_max_metadata_length_increment( tsk_population_table_t *self, tsk_size_t max_metadata_length_increment) { self->max_metadata_length_increment = max_metadata_length_increment; return 0; } int tsk_population_table_init(tsk_population_table_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_memset(self, 0, sizeof(tsk_population_table_t)); /* Allocate space for one row initially, ensuring we always have valid pointers * even if the table is empty */ self->max_rows_increment = 1; self->max_metadata_length_increment = 1; ret = tsk_population_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_population_table_expand_metadata(self, 1); if (ret != 0) { goto out; } self->metadata_offset[0] = 0; self->max_rows_increment = 0; self->max_metadata_length_increment = 0; tsk_population_table_set_metadata_schema(self, NULL, 0); out: return ret; } int TSK_WARN_UNUSED tsk_population_table_copy(const tsk_population_table_t *self, tsk_population_table_t *dest, tsk_flags_t options) { int ret = 0; if 
(!(options & TSK_NO_INIT)) {
        /* `dest` is initialised here unless the caller passed TSK_NO_INIT. */
        ret = tsk_population_table_init(dest, 0);
        if (ret != 0) {
            goto out;
        }
    }
    ret = tsk_population_table_set_columns(
        dest, self->num_rows, self->metadata, self->metadata_offset);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_population_table_set_metadata_schema(
        dest, self->metadata_schema, self->metadata_schema_length);
out:
    return ret;
}

/* Replaces the table's rows with copies of the supplied columns: clears
 * the table, then appends. */
int
tsk_population_table_set_columns(tsk_population_table_t *self, tsk_size_t num_rows,
    const char *metadata, const tsk_size_t *metadata_offset)
{
    int ret;

    ret = tsk_population_table_clear(self);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_population_table_append_columns(self, num_rows, metadata, metadata_offset);
out:
    return ret;
}

/* Appends `num_rows` rows by copying the supplied metadata column. Unlike
 * the migration table's append, both `metadata` and `metadata_offset` are
 * mandatory here. Returns 0 or a negative error code. */
int
tsk_population_table_append_columns(tsk_population_table_t *self, tsk_size_t num_rows,
    const char *metadata, const tsk_size_t *metadata_offset)
{
    int ret;
    tsk_size_t j, metadata_length;

    if (metadata == NULL || metadata_offset == NULL) {
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    ret = tsk_population_table_expand_main_columns(self, num_rows);
    if (ret != 0) {
        goto out;
    }

    ret = check_offsets(num_rows, metadata_offset, 0, false);
    if (ret != 0) {
        goto out;
    }
    /* Rebase the caller's offsets onto the end of the existing metadata. */
    for (j = 0; j < num_rows; j++) {
        self->metadata_offset[self->num_rows + j]
            = self->metadata_length + metadata_offset[j];
    }
    /* The final input offset is the total number of bytes to append. */
    metadata_length = metadata_offset[num_rows];
    ret = tsk_population_table_expand_metadata(self, metadata_length);
    if (ret != 0) {
        goto out;
    }
    tsk_memcpy(self->metadata + self->metadata_length, metadata,
        metadata_length * sizeof(char));
    self->metadata_length += metadata_length;

    self->num_rows += num_rows;
    /* Write the closing offset for the last row. */
    self->metadata_offset[self->num_rows] = self->metadata_length;
out:
    return ret;
}

/* Replaces the table's columns with the caller's buffers (stored by pointer
 * rather than copied). Both buffers are mandatory. */
int
tsk_population_table_takeset_columns(tsk_population_table_t *self, tsk_size_t num_rows,
    char *metadata, tsk_size_t *metadata_offset)
{
    int ret = 0;

    /* We need to check all the inputs before we start freeing or taking memory */
    if (metadata == NULL || metadata_offset == NULL) {
        ret =
tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } ret = check_ragged_column(num_rows, metadata, metadata_offset); if (ret != 0) { goto out; } tsk_population_table_free_columns(self); self->num_rows = num_rows; self->max_rows = num_rows; ret = takeset_ragged_column(num_rows, metadata, metadata_offset, (void *) &self->metadata, &self->metadata_offset, &self->metadata_length); if (ret != 0) { goto out; } out: return ret; } static tsk_id_t tsk_population_table_add_row_internal( tsk_population_table_t *self, const char *metadata, tsk_size_t metadata_length) { tsk_id_t ret = 0; tsk_bug_assert(self->num_rows < self->max_rows); tsk_bug_assert(self->metadata_length + metadata_length <= self->max_metadata_length); tsk_memmove(self->metadata + self->metadata_length, metadata, metadata_length); self->metadata_offset[self->num_rows + 1] = self->metadata_length + metadata_length; self->metadata_length += metadata_length; ret = (tsk_id_t) self->num_rows; self->num_rows++; return ret; } tsk_id_t tsk_population_table_add_row( tsk_population_table_t *self, const char *metadata, tsk_size_t metadata_length) { tsk_id_t ret = 0; ret = tsk_population_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_population_table_expand_metadata(self, metadata_length); if (ret != 0) { goto out; } ret = tsk_population_table_add_row_internal(self, metadata, metadata_length); out: return ret; } static int tsk_population_table_update_row_rewrite(tsk_population_table_t *self, tsk_id_t index, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_id_t j, ret_id; tsk_population_table_t copy; tsk_size_t num_rows; tsk_id_t *rows = NULL; ret = tsk_population_table_copy(self, ©, 0); if (ret != 0) { goto out; } rows = tsk_malloc(self->num_rows * sizeof(*rows)); if (rows == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_population_table_truncate(self, (tsk_size_t) index); tsk_bug_assert(ret == 0); ret_id = tsk_population_table_add_row(self, 
metadata, metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_rows = 0; for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { rows[num_rows] = j; num_rows++; } ret = tsk_population_table_extend(self, ©, num_rows, rows, 0); if (ret != 0) { goto out; } out: tsk_population_table_free(©); tsk_safe_free(rows); return ret; } int tsk_population_table_update_row(tsk_population_table_t *self, tsk_id_t index, const char *metadata, tsk_size_t metadata_length) { int ret = 0; tsk_population_t current_row; ret = tsk_population_table_get_row(self, index, ¤t_row); if (ret != 0) { goto out; } if (current_row.metadata_length == metadata_length) { /* Note: important to use tsk_memmove here as we may be provided pointers * to the column memory as input via get_row */ tsk_memmove(&self->metadata[self->metadata_offset[index]], metadata, metadata_length * sizeof(*metadata)); } else { ret = tsk_population_table_update_row_rewrite( self, index, metadata, metadata_length); if (ret != 0) { goto out; } } out: return ret; } int tsk_population_table_clear(tsk_population_table_t *self) { return tsk_population_table_truncate(self, 0); } int tsk_population_table_truncate(tsk_population_table_t *self, tsk_size_t num_rows) { int ret = 0; if (num_rows > self->num_rows) { ret = tsk_trace_error(TSK_ERR_BAD_TABLE_POSITION); goto out; } self->num_rows = num_rows; self->metadata_length = self->metadata_offset[num_rows]; out: return ret; } int tsk_population_table_extend(tsk_population_table_t *self, const tsk_population_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_population_t population; if (self == other) { ret = tsk_trace_error(TSK_ERR_CANNOT_EXTEND_FROM_SELF); goto out; } /* We know how much to expand the non-ragged columns, so do it ahead of time */ ret = tsk_population_table_expand_main_columns(self, num_rows); if (ret != 0) { goto out; } for (j = 0; j < num_rows; j++) { ret 
= tsk_population_table_get_row(
            other, row_indexes == NULL ? (tsk_id_t) j : row_indexes[j], &population);
        if (ret != 0) {
            goto out;
        }
        ret_id = tsk_population_table_add_row(
            self, population.metadata, population.metadata_length);
        if (ret_id < 0) {
            /* add_row reports failure as a negative ID; pass it on as the
             * error code. */
            ret = (int) ret_id;
            goto out;
        }
    }
    ret = 0;
out:
    return ret;
}

/* Debug helper: prints the table's bookkeeping fields, the metadata schema
 * header and each row's start offset and metadata bytes, then asserts that
 * the offset array starts at 0 and ends at metadata_length. */
void
tsk_population_table_print_state(const tsk_population_table_t *self, FILE *out)
{
    tsk_size_t j, k;

    fprintf(out, "\n" TABLE_SEP);
    fprintf(out, "population_table: %p:\n", (const void *) self);
    fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n",
        (long long) self->num_rows, (long long) self->max_rows,
        (long long) self->max_rows_increment);
    fprintf(out, "metadata_length = %lld\tmax= %lld\tincrement = %lld)\n",
        (long long) self->metadata_length, (long long) self->max_metadata_length,
        (long long) self->max_metadata_length_increment);
    fprintf(out, TABLE_SEP);
    write_metadata_schema_header(
        out, self->metadata_schema, self->metadata_schema_length);
    fprintf(out, "index\tmetadata_offset\tmetadata\n");
    for (j = 0; j < self->num_rows; j++) {
        fprintf(
            out, "%lld\t%lld\t", (long long) j, (long long) self->metadata_offset[j]);
        /* Metadata is written one byte at a time over the row's range. */
        for (k = self->metadata_offset[j]; k < self->metadata_offset[j + 1]; k++) {
            fprintf(out, "%c", self->metadata[k]);
        }
        fprintf(out, "\n");
    }
    tsk_bug_assert(self->metadata_offset[0] == 0);
    tsk_bug_assert(self->metadata_offset[self->num_rows] == self->metadata_length);
}

/* Fills `row` from row `index` with no bounds check. `row->metadata` points
 * into the table's own metadata buffer rather than at a copy. */
static inline void
tsk_population_table_get_row_unsafe(
    const tsk_population_table_t *self, tsk_id_t index, tsk_population_t *row)
{
    row->id = (tsk_id_t) index;
    row->metadata_length
        = self->metadata_offset[index + 1] - self->metadata_offset[index];
    row->metadata = self->metadata + self->metadata_offset[index];
}

/* Bounds-checked row accessor: rejects a negative `index` or one that is
 * not less than num_rows with TSK_ERR_POPULATION_OUT_OF_BOUNDS. */
int
tsk_population_table_get_row(
    const tsk_population_table_t *self, tsk_id_t index, tsk_population_t *row)
{
    int ret = 0;

    if (index < 0 || index >= (tsk_id_t) self->num_rows) {
        ret = tsk_trace_error(TSK_ERR_POPULATION_OUT_OF_BOUNDS);
        goto out;
    }
tsk_population_table_get_row_unsafe(self, index, row); out: return ret; } int tsk_population_table_set_metadata_schema(tsk_population_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, &self->metadata_schema_length, metadata_schema, metadata_schema_length); } int tsk_population_table_dump_text(const tsk_population_table_t *self, FILE *out) { int ret = TSK_ERR_IO; int err; tsk_size_t j; tsk_size_t metadata_len; err = write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); if (err < 0) { goto out; } err = fprintf(out, "metadata\n"); if (err < 0) { goto out; } for (j = 0; j < self->num_rows; j++) { metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; err = fprintf(out, "%.*s\n", (int) metadata_len, self->metadata + self->metadata_offset[j]); if (err < 0) { goto out; } } ret = 0; out: return ret; } bool tsk_population_table_equals(const tsk_population_table_t *self, const tsk_population_table_t *other, tsk_flags_t options) { /* Since we only have the metadata column in the table currently, equality * reduces to comparing the number of rows if we disable metadata comparison. 
*/
    bool ret = self->num_rows == other->num_rows;
    if (!(options & TSK_CMP_IGNORE_METADATA)) {
        /* Compare lengths first so the memcmp calls only run on equally sized
         * buffers. */
        ret = ret && self->metadata_length == other->metadata_length
              && self->metadata_schema_length == other->metadata_schema_length
              && tsk_memcmp(self->metadata_offset, other->metadata_offset,
                     (self->num_rows + 1) * sizeof(tsk_size_t))
                     == 0
              && tsk_memcmp(self->metadata, other->metadata,
                     self->metadata_length * sizeof(char))
                     == 0
              && tsk_memcmp(self->metadata_schema, other->metadata_schema,
                     self->metadata_schema_length * sizeof(char))
                     == 0;
    }
    return ret;
}

/* Removes, in place, every row whose entry in ``keep`` is false. If ``id_map``
 * is non-NULL it is filled from the keep mask by keep_mask_to_id_map before the
 * table is modified. The ``options`` argument is unused. Always returns 0. */
int
tsk_population_table_keep_rows(tsk_population_table_t *self, const tsk_bool_t *keep,
    tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map)
{
    int ret = 0;

    if (id_map != NULL) {
        keep_mask_to_id_map(self->num_rows, keep, id_map);
    }
    /* The ragged column is only compacted when it actually holds data. */
    if (self->metadata_length > 0) {
        self->metadata_length = subset_ragged_char_column(
            self->metadata, self->metadata_offset, self->num_rows, keep);
    }
    self->num_rows = count_true(self->num_rows, keep);
    return ret;
}

/* Writes the metadata schema and the ragged metadata column to the kastore
 * under the "populations/" key prefix. Returns the result of write_table. */
static int
tsk_population_table_dump(
    const tsk_population_table_t *self, kastore_t *store, tsk_flags_t options)
{
    /* Both descriptor arrays are terminated by an entry with a NULL name. */
    const write_table_col_t cols[] = {
        { "populations/metadata_schema", (void *) self->metadata_schema,
            self->metadata_schema_length, KAS_UINT8 },
        { .name = NULL },
    };
    const write_table_ragged_col_t ragged_cols[] = {
        { "populations/metadata", (void *) self->metadata, self->metadata_length,
            KAS_UINT8, self->metadata_offset, self->num_rows },
        { .name = NULL },
    };

    return write_table(store, cols, ragged_cols, options);
}

/* Reads the population table from the kastore. The metadata schema is marked
 * TSK_COL_OPTIONAL and is only set when it was present in the store. On
 * success the metadata buffers are handed over to the table through
 * takeset_columns. Returns 0 on success or the first non-zero error code. */
static int
tsk_population_table_load(tsk_population_table_t *self, kastore_t *store)
{
    int ret = 0;
    char *metadata = NULL;
    tsk_size_t *metadata_offset = NULL;
    char *metadata_schema = NULL;
    tsk_size_t num_rows, metadata_length, metadata_schema_length;

    read_table_ragged_col_t ragged_cols[] = {
        { "populations/metadata", (void **) &metadata, &metadata_length, KAS_UINT8,
            &metadata_offset, 0 },
        { .name = NULL },
    };
    read_table_property_t properties[] = {
        { "populations/metadata_schema", (void **) &metadata_schema,
            &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL },
        { .name = NULL },
    };

    ret = read_table(store, &num_rows, NULL, ragged_cols, properties, 0);
    if (ret != 0) {
        goto out;
    }
    if (metadata_schema != NULL) {
        ret = tsk_population_table_set_metadata_schema(
            self, metadata_schema, metadata_schema_length);
        if (ret != 0) {
            goto out;
        }
    }
    ret = tsk_population_table_takeset_columns(
        self, num_rows, metadata, metadata_offset);
    if (ret != 0) {
        goto out;
    }
    /* The table now owns these buffers; clear the locals so that the cleanup
     * below does not see them. */
    metadata = NULL;
    metadata_offset = NULL;

out:
    free_read_table_mem(NULL, ragged_cols, properties);
    return ret;
}

/*************************
 * provenance table
 *************************/

/* Releases the four column buffers (timestamp, record and their offsets). */
static void
tsk_provenance_table_free_columns(tsk_provenance_table_t *self)
{
    tsk_safe_free(self->timestamp);
    tsk_safe_free(self->timestamp_offset);
    tsk_safe_free(self->record);
    tsk_safe_free(self->record_offset);
}

/* Frees all memory held by the table's columns. Always returns 0. */
int
tsk_provenance_table_free(tsk_provenance_table_t *self)
{
    tsk_provenance_table_free_columns(self);
    return 0;
}

/* Ensures the two offset columns can hold ``additional_rows`` more rows.
 * The new capacity comes from calculate_max_rows; the offset arrays are sized
 * one larger than the row capacity because they store a trailing end offset.
 * Returns 0 on success or the error from the failing helper. */
static int
tsk_provenance_table_expand_main_columns(
    tsk_provenance_table_t *self, tsk_size_t additional_rows)
{
    int ret = 0;
    tsk_size_t new_max_rows;

    ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment,
        additional_rows, &new_max_rows);
    if (ret != 0) {
        goto out;
    }
    if ((self->num_rows + additional_rows) > self->max_rows) {
        ret = expand_column(
            (void **) &self->timestamp_offset, new_max_rows + 1, sizeof(tsk_size_t));
        if (ret != 0) {
            goto out;
        }
        ret = expand_column(
            (void **) &self->record_offset, new_max_rows + 1, sizeof(tsk_size_t));
        if (ret != 0) {
            goto out;
        }
        self->max_rows = new_max_rows;
    }
out:
    return ret;
}

/* Ensures the timestamp buffer has room for ``additional_length`` more bytes
 * by delegating to expand_ragged_column. */
static int
tsk_provenance_table_expand_timestamp(
    tsk_provenance_table_t *self, tsk_size_t additional_length)
{
    return expand_ragged_column(self->timestamp_length, additional_length,
        self->max_timestamp_length_increment, &self->max_timestamp_length,
        (void **) &self->timestamp, sizeof(*self->timestamp));
}

static int
tsk_provenance_table_expand_record( tsk_provenance_table_t *self, tsk_size_t additional_length) { return expand_ragged_column(self->record_length, additional_length, self->max_record_length_increment, &self->max_record_length, (void **) &self->record, sizeof(*self->record)); } int tsk_provenance_table_set_max_rows_increment( tsk_provenance_table_t *self, tsk_size_t max_rows_increment) { self->max_rows_increment = max_rows_increment; return 0; } int tsk_provenance_table_set_max_timestamp_length_increment( tsk_provenance_table_t *self, tsk_size_t max_timestamp_length_increment) { self->max_timestamp_length_increment = max_timestamp_length_increment; return 0; } int tsk_provenance_table_set_max_record_length_increment( tsk_provenance_table_t *self, tsk_size_t max_record_length_increment) { self->max_record_length_increment = max_record_length_increment; return 0; } int tsk_provenance_table_init(tsk_provenance_table_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_memset(self, 0, sizeof(tsk_provenance_table_t)); /* Allocate space for one row initially, ensuring we always have valid pointers * even if the table is empty */ self->max_rows_increment = 1; self->max_timestamp_length_increment = 1; self->max_record_length_increment = 1; ret = tsk_provenance_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_provenance_table_expand_timestamp(self, 1); if (ret != 0) { goto out; } self->timestamp_offset[0] = 0; ret = tsk_provenance_table_expand_record(self, 1); if (ret != 0) { goto out; } self->record_offset[0] = 0; self->max_rows_increment = 0; self->max_timestamp_length_increment = 0; self->max_record_length_increment = 0; out: return ret; } int TSK_WARN_UNUSED tsk_provenance_table_copy(const tsk_provenance_table_t *self, tsk_provenance_table_t *dest, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_provenance_table_init(dest, 0); if (ret != 0) { goto out; } } ret = tsk_provenance_table_set_columns(dest, 
self->num_rows, self->timestamp, self->timestamp_offset,
        self->record, self->record_offset);
out:
    return ret;
}

/* Replaces the table's contents with the supplied columns: the table is
 * cleared and the columns are then appended (copied). Returns 0 on success or
 * the first non-zero error code. */
int
tsk_provenance_table_set_columns(tsk_provenance_table_t *self, tsk_size_t num_rows,
    const char *timestamp, const tsk_size_t *timestamp_offset, const char *record,
    const tsk_size_t *record_offset)
{
    int ret;

    ret = tsk_provenance_table_clear(self);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_provenance_table_append_columns(
        self, num_rows, timestamp, timestamp_offset, record, record_offset);
out:
    return ret;
}

/* Appends ``num_rows`` rows, copying from the supplied ragged columns.
 * All four pointers are required; TSK_ERR_BAD_PARAM_VALUE is returned if any
 * is NULL. Each offset array is validated with check_offsets and is read up to
 * index ``num_rows`` inclusive. Returns 0 on success. */
int
tsk_provenance_table_append_columns(tsk_provenance_table_t *self, tsk_size_t num_rows,
    const char *timestamp, const tsk_size_t *timestamp_offset, const char *record,
    const tsk_size_t *record_offset)
{
    int ret;
    tsk_size_t j, timestamp_length, record_length;

    if (timestamp == NULL || timestamp_offset == NULL || record == NULL
        || record_offset == NULL) {
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    ret = tsk_provenance_table_expand_main_columns(self, num_rows);
    if (ret != 0) {
        goto out;
    }

    ret = check_offsets(num_rows, timestamp_offset, 0, false);
    if (ret != 0) {
        goto out;
    }
    /* Rebase the incoming offsets onto the end of the existing data. */
    for (j = 0; j < num_rows; j++) {
        self->timestamp_offset[self->num_rows + j]
            = self->timestamp_length + timestamp_offset[j];
    }
    timestamp_length = timestamp_offset[num_rows];
    ret = tsk_provenance_table_expand_timestamp(self, timestamp_length);
    if (ret != 0) {
        goto out;
    }
    tsk_memcpy(self->timestamp + self->timestamp_length, timestamp,
        timestamp_length * sizeof(char));
    self->timestamp_length += timestamp_length;

    ret = check_offsets(num_rows, record_offset, 0, false);
    if (ret != 0) {
        goto out;
    }
    /* Same rebasing for the record column. */
    for (j = 0; j < num_rows; j++) {
        self->record_offset[self->num_rows + j] = self->record_length + record_offset[j];
    }
    record_length = record_offset[num_rows];
    ret = tsk_provenance_table_expand_record(self, record_length);
    if (ret != 0) {
        goto out;
    }
    tsk_memcpy(self->record + self->record_length, record, record_length * sizeof(char));
    self->record_length += record_length;
self->num_rows += num_rows;
    /* Write the trailing end offsets for the new final row. */
    self->timestamp_offset[self->num_rows] = self->timestamp_length;
    self->record_offset[self->num_rows] = self->record_length;
out:
    return ret;
}

/* Replaces the table's columns with the caller-supplied buffers rather than
 * copying them. All four pointers are required (TSK_ERR_BAD_PARAM_VALUE
 * otherwise) and both ragged columns are validated with check_ragged_column
 * before the existing columns are freed. Returns 0 on success. */
int
tsk_provenance_table_takeset_columns(tsk_provenance_table_t *self, tsk_size_t num_rows,
    char *timestamp, tsk_size_t *timestamp_offset, char *record,
    tsk_size_t *record_offset)
{
    int ret = 0;

    /* We need to check all the inputs before we start freeing or taking memory */
    if (timestamp == NULL || timestamp_offset == NULL || record == NULL
        || record_offset == NULL) {
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    ret = check_ragged_column(num_rows, timestamp, timestamp_offset);
    if (ret != 0) {
        goto out;
    }
    ret = check_ragged_column(num_rows, record, record_offset);
    if (ret != 0) {
        goto out;
    }

    tsk_provenance_table_free_columns(self);
    /* Capacity is set to exactly the number of rows supplied. */
    self->num_rows = num_rows;
    self->max_rows = num_rows;

    ret = takeset_ragged_column(num_rows, timestamp, timestamp_offset,
        (void *) &self->timestamp, &self->timestamp_offset, &self->timestamp_length);
    if (ret != 0) {
        goto out;
    }
    ret = takeset_ragged_column(num_rows, record, record_offset,
        (void *) &self->record, &self->record_offset, &self->record_length);
    if (ret != 0) {
        goto out;
    }
out:
    return ret;
}

/* Appends one row, assuming the caller has already made room in every column
 * (this is asserted, not checked). tsk_memmove is used for the copies, so the
 * source ranges may overlap the destination. Returns the ID of the new row. */
static tsk_id_t
tsk_provenance_table_add_row_internal(tsk_provenance_table_t *self,
    const char *timestamp, tsk_size_t timestamp_length, const char *record,
    tsk_size_t record_length)
{
    tsk_id_t ret = 0;

    tsk_bug_assert(self->num_rows < self->max_rows);
    tsk_bug_assert(
        self->timestamp_length + timestamp_length <= self->max_timestamp_length);
    tsk_memmove(self->timestamp + self->timestamp_length, timestamp, timestamp_length);
    self->timestamp_offset[self->num_rows + 1]
        = self->timestamp_length + timestamp_length;
    self->timestamp_length += timestamp_length;
    tsk_bug_assert(self->record_length + record_length <= self->max_record_length);
    tsk_memmove(self->record + self->record_length, record, record_length);
    self->record_offset[self->num_rows + 1] = self->record_length
+ record_length; self->record_length += record_length; ret = (tsk_id_t) self->num_rows; self->num_rows++; return ret; } tsk_id_t tsk_provenance_table_add_row(tsk_provenance_table_t *self, const char *timestamp, tsk_size_t timestamp_length, const char *record, tsk_size_t record_length) { tsk_id_t ret = 0; ret = tsk_provenance_table_expand_main_columns(self, 1); if (ret != 0) { goto out; } ret = tsk_provenance_table_expand_timestamp(self, timestamp_length); if (ret != 0) { goto out; } ret = tsk_provenance_table_expand_record(self, record_length); if (ret != 0) { goto out; } ret = tsk_provenance_table_add_row_internal( self, timestamp, timestamp_length, record, record_length); out: return ret; } static int tsk_provenance_table_update_row_rewrite(tsk_provenance_table_t *self, tsk_id_t index, const char *timestamp, tsk_size_t timestamp_length, const char *record, tsk_size_t record_length) { int ret = 0; tsk_id_t j, ret_id; tsk_provenance_table_t copy; tsk_size_t num_rows; tsk_id_t *rows = NULL; ret = tsk_provenance_table_copy(self, ©, 0); if (ret != 0) { goto out; } rows = tsk_malloc(self->num_rows * sizeof(*rows)); if (rows == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_provenance_table_truncate(self, (tsk_size_t) index); tsk_bug_assert(ret == 0); ret_id = tsk_provenance_table_add_row( self, timestamp, timestamp_length, record, record_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_rows = 0; for (j = index + 1; j < (tsk_id_t) copy.num_rows; j++) { rows[num_rows] = j; num_rows++; } ret = tsk_provenance_table_extend(self, ©, num_rows, rows, 0); if (ret != 0) { goto out; } out: tsk_provenance_table_free(©); tsk_safe_free(rows); return ret; } int tsk_provenance_table_update_row(tsk_provenance_table_t *self, tsk_id_t index, const char *timestamp, tsk_size_t timestamp_length, const char *record, tsk_size_t record_length) { int ret = 0; tsk_provenance_t current_row; ret = tsk_provenance_table_get_row(self, index, ¤t_row); if (ret != 
0) {
        goto out;
    }
    if (current_row.timestamp_length == timestamp_length
        && current_row.record_length == record_length) {
        /* Same lengths: overwrite the stored bytes in place.
         * Note: important to use tsk_memmove here as we may be provided pointers
         * to the column memory as input via get_row */
        tsk_memmove(&self->timestamp[self->timestamp_offset[index]], timestamp,
            timestamp_length * sizeof(*timestamp));
        tsk_memmove(&self->record[self->record_offset[index]], record,
            record_length * sizeof(*record));
    } else {
        /* Lengths differ, so the rows after this one must be shifted. */
        ret = tsk_provenance_table_update_row_rewrite(
            self, index, timestamp, timestamp_length, record, record_length);
        if (ret != 0) {
            goto out;
        }
    }
out:
    return ret;
}

/* Removes all rows by truncating the table to zero rows. */
int
tsk_provenance_table_clear(tsk_provenance_table_t *self)
{
    return tsk_provenance_table_truncate(self, 0);
}

/* Shrinks the table to its first ``num_rows`` rows by resetting the row count
 * and the column lengths; no buffers are touched. Returns
 * TSK_ERR_BAD_TABLE_POSITION if ``num_rows`` exceeds the current row count,
 * otherwise 0. */
int
tsk_provenance_table_truncate(tsk_provenance_table_t *self, tsk_size_t num_rows)
{
    int ret = 0;

    if (num_rows > self->num_rows) {
        ret = tsk_trace_error(TSK_ERR_BAD_TABLE_POSITION);
        goto out;
    }
    self->num_rows = num_rows;
    self->timestamp_length = self->timestamp_offset[num_rows];
    self->record_length = self->record_offset[num_rows];
out:
    return ret;
}

/* Appends ``num_rows`` rows taken from ``other``. When ``row_indexes`` is NULL
 * the first ``num_rows`` rows of ``other`` are used in order; otherwise
 * row_indexes[j] selects the j-th row to append. Extending a table from itself
 * is rejected with TSK_ERR_CANNOT_EXTEND_FROM_SELF. The ``options`` argument
 * is unused. Returns 0 on success or a non-zero error code. */
int
tsk_provenance_table_extend(tsk_provenance_table_t *self,
    const tsk_provenance_table_t *other, tsk_size_t num_rows,
    const tsk_id_t *row_indexes, tsk_flags_t TSK_UNUSED(options))
{
    int ret = 0;
    tsk_id_t ret_id;
    tsk_size_t j;
    tsk_provenance_t provenance;

    if (self == other) {
        ret = tsk_trace_error(TSK_ERR_CANNOT_EXTEND_FROM_SELF);
        goto out;
    }

    /* We know how much to expand the non-ragged columns, so do it ahead of time */
    ret = tsk_provenance_table_expand_main_columns(self, num_rows);
    if (ret != 0) {
        goto out;
    }
    for (j = 0; j < num_rows; j++) {
        ret = tsk_provenance_table_get_row(
            other, row_indexes == NULL ?
(tsk_id_t) j : row_indexes[j], &provenance);
        if (ret != 0) {
            goto out;
        }
        /* A negative return from add_row is an error code rather than a row ID. */
        ret_id = tsk_provenance_table_add_row(self, provenance.timestamp,
            provenance.timestamp_length, provenance.record, provenance.record_length);
        if (ret_id < 0) {
            ret = (int) ret_id;
            goto out;
        }
    }
    ret = 0;
out:
    return ret;
}

/* Debugging aid: writes the table's bookkeeping values and one line per row
 * (offsets plus raw timestamp and record bytes) to ``out``. Afterwards asserts
 * that both offset columns start at zero and end at the column length. */
void
tsk_provenance_table_print_state(const tsk_provenance_table_t *self, FILE *out)
{
    tsk_size_t j, k;

    fprintf(out, "\n" TABLE_SEP);
    fprintf(out, "provenance_table: %p:\n", (const void *) self);
    fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n",
        (long long) self->num_rows, (long long) self->max_rows,
        (long long) self->max_rows_increment);
    fprintf(out, "timestamp_length = %lld\tmax= %lld\tincrement = %lld)\n",
        (long long) self->timestamp_length, (long long) self->max_timestamp_length,
        (long long) self->max_timestamp_length_increment);
    fprintf(out, "record_length = %lld\tmax= %lld\tincrement = %lld)\n",
        (long long) self->record_length, (long long) self->max_record_length,
        (long long) self->max_record_length_increment);
    fprintf(out, TABLE_SEP);
    fprintf(out, "index\ttimestamp_offset\ttimestamp\trecord_offset\tprovenance\n");
    for (j = 0; j < self->num_rows; j++) {
        fprintf(
            out, "%lld\t%lld\t", (long long) j, (long long) self->timestamp_offset[j]);
        /* Column data is not NUL terminated, so emit it byte by byte. */
        for (k = self->timestamp_offset[j]; k < self->timestamp_offset[j + 1]; k++) {
            fprintf(out, "%c", self->timestamp[k]);
        }
        fprintf(out, "\t%lld\t", (long long) self->record_offset[j]);
        for (k = self->record_offset[j]; k < self->record_offset[j + 1]; k++) {
            fprintf(out, "%c", self->record[k]);
        }
        fprintf(out, "\n");
    }
    tsk_bug_assert(self->timestamp_offset[0] == 0);
    tsk_bug_assert(self->timestamp_offset[self->num_rows] == self->timestamp_length);
    tsk_bug_assert(self->record_offset[0] == 0);
    tsk_bug_assert(self->record_offset[self->num_rows] == self->record_length);
}

/* Fills ``row`` with the ID and views (pointer + length) of the timestamp and
 * record for row ``index``. No bounds checking is performed. */
static inline void
tsk_provenance_table_get_row_unsafe(
    const tsk_provenance_table_t *self, tsk_id_t index, tsk_provenance_t *row)
{
    row->id = (tsk_id_t) index;
row->timestamp_length = self->timestamp_offset[index + 1] - self->timestamp_offset[index]; row->timestamp = self->timestamp + self->timestamp_offset[index]; row->record_length = self->record_offset[index + 1] - self->record_offset[index]; row->record = self->record + self->record_offset[index]; } int tsk_provenance_table_get_row( const tsk_provenance_table_t *self, tsk_id_t index, tsk_provenance_t *row) { int ret = 0; if (index < 0 || index >= (tsk_id_t) self->num_rows) { ret = tsk_trace_error(TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); goto out; } tsk_provenance_table_get_row_unsafe(self, index, row); out: return ret; } int tsk_provenance_table_dump_text(const tsk_provenance_table_t *self, FILE *out) { int ret = TSK_ERR_IO; int err; tsk_size_t j, timestamp_len, record_len; err = fprintf(out, "record\ttimestamp\n"); if (err < 0) { goto out; } for (j = 0; j < self->num_rows; j++) { record_len = self->record_offset[j + 1] - self->record_offset[j]; timestamp_len = self->timestamp_offset[j + 1] - self->timestamp_offset[j]; err = fprintf(out, "%.*s\t%.*s\n", (int) record_len, self->record + self->record_offset[j], (int) timestamp_len, self->timestamp + self->timestamp_offset[j]); if (err < 0) { goto out; } } ret = 0; out: return ret; } bool tsk_provenance_table_equals(const tsk_provenance_table_t *self, const tsk_provenance_table_t *other, tsk_flags_t options) { bool ret = self->num_rows == other->num_rows && self->record_length == other->record_length && tsk_memcmp(self->record_offset, other->record_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->record, other->record, self->record_length * sizeof(char)) == 0; if (!(options & TSK_CMP_IGNORE_TIMESTAMPS)) { ret = ret && self->timestamp_length == other->timestamp_length && tsk_memcmp(self->timestamp_offset, other->timestamp_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 && tsk_memcmp(self->timestamp, other->timestamp, self->timestamp_length * sizeof(char)) == 0; } return ret; } int 
tsk_provenance_table_keep_rows(tsk_provenance_table_t *self, const tsk_bool_t *keep, tsk_flags_t TSK_UNUSED(options), tsk_id_t *id_map) { int ret = 0; if (id_map != NULL) { keep_mask_to_id_map(self->num_rows, keep, id_map); } self->timestamp_length = subset_ragged_char_column( self->timestamp, self->timestamp_offset, self->num_rows, keep); self->record_length = subset_ragged_char_column( self->record, self->record_offset, self->num_rows, keep); self->num_rows = count_true(self->num_rows, keep); return ret; } static int tsk_provenance_table_dump( const tsk_provenance_table_t *self, kastore_t *store, tsk_flags_t options) { write_table_ragged_col_t ragged_cols[] = { { "provenances/timestamp", (void *) self->timestamp, self->timestamp_length, KAS_UINT8, self->timestamp_offset, self->num_rows }, { "provenances/record", (void *) self->record, self->record_length, KAS_UINT8, self->record_offset, self->num_rows }, { .name = NULL }, }; return write_table_ragged_cols(store, ragged_cols, options); } static int tsk_provenance_table_load(tsk_provenance_table_t *self, kastore_t *store) { int ret; char *timestamp = NULL; tsk_size_t *timestamp_offset = NULL; char *record = NULL; tsk_size_t *record_offset = NULL; tsk_size_t num_rows, timestamp_length, record_length; read_table_ragged_col_t ragged_cols[] = { { "provenances/timestamp", (void **) ×tamp, ×tamp_length, KAS_UINT8, ×tamp_offset, 0 }, { "provenances/record", (void **) &record, &record_length, KAS_UINT8, &record_offset, 0 }, { .name = NULL }, }; ret = read_table(store, &num_rows, NULL, ragged_cols, NULL, 0); if (ret != 0) { goto out; } ret = tsk_provenance_table_takeset_columns( self, num_rows, timestamp, timestamp_offset, record, record_offset); if (ret != 0) { goto out; } timestamp = NULL; timestamp_offset = NULL; record = NULL; record_offset = NULL; out: free_read_table_mem(NULL, ragged_cols, NULL); return ret; } /************************* * sort_tables *************************/ typedef struct { double left; double 
right;
    tsk_id_t parent;
    tsk_id_t child;
    /* Sort key; the edge sorter fills this with the parent node's time. */
    double time;
    /* It would be a little bit more convenient to store a pointer to the
     * metadata here in the struct rather than an offset back into the
     * original array. However, this would increase the size of the struct
     * from 40 bytes to 48 and we will allocate very large numbers of these.
     */
    tsk_size_t metadata_offset;
    tsk_size_t metadata_length;
} edge_sort_t;

/* A mutation row plus the extra keys used to order mutations within a site. */
typedef struct {
    tsk_mutation_t mut;
    int num_descendants;
    double node_time;
} mutation_sort_t;

/* An individual row plus the keys used by the canonical individual sort. */
typedef struct {
    tsk_individual_t ind;
    tsk_id_t first_node;
    tsk_size_t num_descendants;
} individual_canonical_sort_t;

/* Flat copy of a migration row; metadata is referenced by offset and length. */
typedef struct {
    double left;
    double right;
    tsk_id_t node;
    tsk_id_t source;
    tsk_id_t dest;
    double time;
    tsk_size_t metadata_offset;
    tsk_size_t metadata_length;
} migration_sort_t;

/* qsort comparator for tsk_site_t: ascending position, ties broken by
 * ascending ID. */
static int
cmp_site(const void *a, const void *b)
{
    const tsk_site_t *ia = (const tsk_site_t *) a;
    const tsk_site_t *ib = (const tsk_site_t *) b;
    /* Compare sites by position */
    int ret = (ia->position > ib->position) - (ia->position < ib->position);
    if (ret == 0) {
        /* Within a particular position sort by ID.  This ensures that relative
         * ordering of multiple sites at the same position is maintained; the
         * redundant sites will get compacted down by clean_tables(), but in the
         * meantime if the order of the redundant sites changes it will cause the
         * sort order of mutations to be corrupted, as the mutations will follow
         * their sites.
*/ ret = (ia->id > ib->id) - (ia->id < ib->id); } return ret; } static int cmp_mutation(const void *a, const void *b) { const mutation_sort_t *ia = (const mutation_sort_t *) a; const mutation_sort_t *ib = (const mutation_sort_t *) b; /* Compare mutations by site */ int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site); /* Within a particular site sort by time if known */ if (ret == 0 && !tsk_is_unknown_time(ia->mut.time) && !tsk_is_unknown_time(ib->mut.time)) { ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time); } /* Or node times when mutation times are unknown or equal */ if (ret == 0) { ret = (ia->node_time < ib->node_time) - (ia->node_time > ib->node_time); } /* If node times are equal, sort by number of descendants */ if (ret == 0) { ret = (ia->num_descendants < ib->num_descendants) - (ia->num_descendants > ib->num_descendants); } /* If number of descendants are equal, sort by node */ if (ret == 0) { ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node); } /* Final tiebreaker: ID */ if (ret == 0) { ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id); } return ret; } static int cmp_individual_canonical(const void *a, const void *b) { const individual_canonical_sort_t *ia = (const individual_canonical_sort_t *) a; const individual_canonical_sort_t *ib = (const individual_canonical_sort_t *) b; int ret = (ia->num_descendants < ib->num_descendants) - (ia->num_descendants > ib->num_descendants); if (ret == 0) { ret = (ia->first_node > ib->first_node) - (ia->first_node < ib->first_node); } if (ret == 0) { ret = (ia->ind.id > ib->ind.id) - (ia->ind.id < ib->ind.id); } return ret; } static int cmp_edge(const void *a, const void *b) { const edge_sort_t *ca = (const edge_sort_t *) a; const edge_sort_t *cb = (const edge_sort_t *) b; int ret = (ca->time > cb->time) - (ca->time < cb->time); /* If time values are equal, sort by the parent node */ if (ret == 0) { ret = (ca->parent > cb->parent) - (ca->parent < 
cb->parent);
        /* If the parent nodes are equal, sort by the child ID. */
        if (ret == 0) {
            ret = (ca->child > cb->child) - (ca->child < cb->child);
            /* If the child nodes are equal, sort by the left coordinate. */
            if (ret == 0) {
                ret = (ca->left > cb->left) - (ca->left < cb->left);
            }
        }
    }
    return ret;
}

/* qsort comparator for migration_sort_t. Keys are compared in ascending order:
 * time, then source, then dest, then left coordinate, then node. */
static int
cmp_migration(const void *a, const void *b)
{
    const migration_sort_t *ca = (const migration_sort_t *) a;
    const migration_sort_t *cb = (const migration_sort_t *) b;

    int ret = (ca->time > cb->time) - (ca->time < cb->time);
    /* If time values are equal, sort by the source population */
    if (ret == 0) {
        ret = (ca->source > cb->source) - (ca->source < cb->source);
        /* If the source populations are equal, sort by the dest */
        if (ret == 0) {
            ret = (ca->dest > cb->dest) - (ca->dest < cb->dest);
            /* If the dest populations are equal, sort by the left coordinate. */
            if (ret == 0) {
                ret = (ca->left > cb->left) - (ca->left < cb->left);
                /* If everything else is equal, compare by node */
                if (ret == 0) {
                    ret = (ca->node > cb->node) - (ca->node < cb->node);
                }
            }
        }
    }
    return ret;
}

/* Sorts the edges from row ``start`` to the end of the edge table with
 * cmp_edge, leaving rows before ``start`` untouched. The rows are copied into
 * a temporary array of edge_sort_t (keyed on the parent node's time), sorted,
 * and written back; a full copy of the metadata buffer is kept so metadata can
 * be rewritten in the new order. Returns 0 on success or TSK_ERR_NO_MEMORY. */
static int
tsk_table_sorter_sort_edges(tsk_table_sorter_t *self, tsk_size_t start)
{
    int ret = 0;
    const tsk_edge_table_t *edges = &self->tables->edges;
    const double *restrict node_time = self->tables->nodes.time;
    edge_sort_t *e;
    tsk_size_t j, k, metadata_offset;
    tsk_size_t n = edges->num_rows - start;
    edge_sort_t *sorted_edges = tsk_malloc(n * sizeof(*sorted_edges));
    char *old_metadata = tsk_malloc(edges->metadata_length);
    bool has_metadata = tsk_edge_table_has_metadata(edges);

    if (sorted_edges == NULL || old_metadata == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }
    tsk_memcpy(old_metadata, edges->metadata, edges->metadata_length);
    for (j = 0; j < n; j++) {
        e = sorted_edges + j;
        k = start + j;
        e->left = edges->left[k];
        e->right = edges->right[k];
        e->parent = edges->parent[k];
        e->child = edges->child[k];
        e->time = node_time[e->parent];
        /* The metadata offset column is only read when the table has one. */
        if (has_metadata) {
            e->metadata_offset =
edges->metadata_offset[k]; e->metadata_length = edges->metadata_offset[k + 1] - edges->metadata_offset[k]; } } qsort(sorted_edges, (size_t) n, sizeof(edge_sort_t), cmp_edge); /* Copy the edges back into the table. */ metadata_offset = 0; for (j = 0; j < n; j++) { e = sorted_edges + j; k = start + j; edges->left[k] = e->left; edges->right[k] = e->right; edges->parent[k] = e->parent; edges->child[k] = e->child; if (has_metadata) { tsk_memcpy(edges->metadata + metadata_offset, old_metadata + e->metadata_offset, e->metadata_length); edges->metadata_offset[k] = metadata_offset; metadata_offset += e->metadata_length; } } out: tsk_safe_free(sorted_edges); tsk_safe_free(old_metadata); return ret; } static int tsk_table_sorter_sort_migrations(tsk_table_sorter_t *self, tsk_size_t start) { int ret = 0; const tsk_migration_table_t *migrations = &self->tables->migrations; migration_sort_t *m; tsk_size_t j, k, metadata_offset; tsk_size_t n = migrations->num_rows - start; migration_sort_t *sorted_migrations = tsk_malloc(n * sizeof(*sorted_migrations)); char *old_metadata = tsk_malloc(migrations->metadata_length); if (sorted_migrations == NULL || old_metadata == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy(old_metadata, migrations->metadata, migrations->metadata_length); for (j = 0; j < n; j++) { m = sorted_migrations + j; k = start + j; m->left = migrations->left[k]; m->right = migrations->right[k]; m->node = migrations->node[k]; m->source = migrations->source[k]; m->dest = migrations->dest[k]; m->time = migrations->time[k]; m->metadata_offset = migrations->metadata_offset[k]; m->metadata_length = migrations->metadata_offset[k + 1] - migrations->metadata_offset[k]; } qsort(sorted_migrations, (size_t) n, sizeof(migration_sort_t), cmp_migration); /* Copy the migrations back into the table. 
*/ metadata_offset = 0; for (j = 0; j < n; j++) { m = sorted_migrations + j; k = start + j; migrations->left[k] = m->left; migrations->right[k] = m->right; migrations->node[k] = m->node; migrations->source[k] = m->source; migrations->dest[k] = m->dest; migrations->time[k] = m->time; tsk_memcpy(migrations->metadata + metadata_offset, old_metadata + m->metadata_offset, m->metadata_length); migrations->metadata_offset[k] = metadata_offset; metadata_offset += m->metadata_length; } out: tsk_safe_free(sorted_migrations); tsk_safe_free(old_metadata); return ret; } static int tsk_table_sorter_sort_sites(tsk_table_sorter_t *self) { int ret = 0; tsk_id_t ret_id; tsk_site_table_t *sites = &self->tables->sites; tsk_site_table_t copy; tsk_size_t j; tsk_size_t num_sites = sites->num_rows; tsk_site_t *sorted_sites = tsk_malloc(num_sites * sizeof(*sorted_sites)); ret = tsk_site_table_copy(sites, ©, 0); if (ret != 0) { goto out; } if (sorted_sites == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < num_sites; j++) { tsk_site_table_get_row_unsafe(©, (tsk_id_t) j, sorted_sites + j); } /* Sort the sites by position */ qsort(sorted_sites, (size_t) num_sites, sizeof(*sorted_sites), cmp_site); /* Build the mapping from old site IDs to new site IDs and copy back into the * table */ tsk_site_table_clear(sites); for (j = 0; j < num_sites; j++) { self->site_id_map[sorted_sites[j].id] = (tsk_id_t) j; ret_id = tsk_site_table_add_row(sites, sorted_sites[j].position, sorted_sites[j].ancestral_state, sorted_sites[j].ancestral_state_length, sorted_sites[j].metadata, sorted_sites[j].metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; out: tsk_safe_free(sorted_sites); tsk_site_table_free(©); return ret; } static int tsk_table_sorter_sort_mutations(tsk_table_sorter_t *self) { int ret = 0; tsk_size_t j; tsk_id_t ret_id, parent, mapped_parent, p; tsk_mutation_table_t *mutations = &self->tables->mutations; tsk_node_table_t *nodes = 
&self->tables->nodes; tsk_size_t num_mutations = mutations->num_rows; tsk_mutation_table_t copy; mutation_sort_t *sorted_mutations = tsk_malloc(num_mutations * sizeof(*sorted_mutations)); tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); ret = tsk_mutation_table_copy(mutations, ©, 0); if (ret != 0) { goto out; } if (mutation_id_map == NULL || sorted_mutations == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } /* compute numbers of descendants for each mutation */ for (j = 0; j < num_mutations; j++) { sorted_mutations[j].num_descendants = 0; } for (j = 0; j < num_mutations; j++) { p = mutations->parent[j]; while (p != TSK_NULL) { sorted_mutations[p].num_descendants += 1; if (sorted_mutations[p].num_descendants > (int) num_mutations) { ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_INCONSISTENT); goto out; } p = mutations->parent[p]; } } for (j = 0; j < num_mutations; j++) { tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, &sorted_mutations[j].mut); sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site]; sorted_mutations[j].node_time = nodes->time[sorted_mutations[j].mut.node]; } ret = tsk_mutation_table_clear(mutations); if (ret != 0) { goto out; } qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations), cmp_mutation); /* Make a first pass through the sorted mutations to build the ID map. 
*/ for (j = 0; j < num_mutations; j++) { mutation_id_map[sorted_mutations[j].mut.id] = (tsk_id_t) j; } for (j = 0; j < num_mutations; j++) { mapped_parent = TSK_NULL; parent = sorted_mutations[j].mut.parent; if (parent != TSK_NULL) { mapped_parent = mutation_id_map[parent]; } ret_id = tsk_mutation_table_add_row(mutations, sorted_mutations[j].mut.site, sorted_mutations[j].mut.node, mapped_parent, sorted_mutations[j].mut.time, sorted_mutations[j].mut.derived_state, sorted_mutations[j].mut.derived_state_length, sorted_mutations[j].mut.metadata, sorted_mutations[j].mut.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; out: tsk_safe_free(mutation_id_map); tsk_safe_free(sorted_mutations); tsk_mutation_table_free(©); return ret; } static int tsk_individual_table_topological_sort( tsk_individual_table_t *self, tsk_id_t *traversal_order, tsk_size_t *num_descendants) { int ret = 0; tsk_id_t i, j, p; tsk_individual_t individual; tsk_size_t num_individuals = self->num_rows; tsk_size_t current_todo = 0; tsk_size_t todo_insertion_point = 0; tsk_size_t *incoming_edge_count = tsk_malloc(num_individuals * sizeof(*incoming_edge_count)); bool count_descendants = (num_descendants != NULL); if (incoming_edge_count == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (i = 0; i < (tsk_id_t) num_individuals; i++) { incoming_edge_count[i] = 0; traversal_order[i] = TSK_NULL; if (count_descendants) { num_descendants[i] = 0; } } /* First find the set of individuals that have no children by creating * an array of incoming edge counts */ for (i = 0; i < (tsk_id_t) self->parents_length; i++) { if (self->parents[i] != TSK_NULL) { incoming_edge_count[self->parents[i]]++; } } /* Use these as the starting points for checking all individuals, * doing this in reverse makes the sort stable */ for (i = (tsk_id_t) num_individuals - 1; i >= 0; i--) { if (incoming_edge_count[i] == 0) { traversal_order[todo_insertion_point] = i; todo_insertion_point++; } } /* Now 
process individuals from the set that have no children, updating their * parents' information as we go, and adding their parents to the list if * this was their last child */ while (current_todo < todo_insertion_point) { j = traversal_order[current_todo]; tsk_individual_table_get_row_unsafe(self, j, &individual); for (i = 0; i < (tsk_id_t) individual.parents_length; i++) { p = individual.parents[i]; if (p != TSK_NULL) { incoming_edge_count[p]--; if (count_descendants) { num_descendants[p] += 1 + num_descendants[j]; } if (incoming_edge_count[p] == 0) { traversal_order[todo_insertion_point] = p; todo_insertion_point++; } } } current_todo++; } /* Any edges left are parts of cycles */ for (i = 0; i < (tsk_id_t) num_individuals; i++) { if (incoming_edge_count[i] > 0) { ret = tsk_trace_error(TSK_ERR_INDIVIDUAL_PARENT_CYCLE); goto out; } } out: tsk_safe_free(incoming_edge_count); return ret; } int tsk_table_collection_individual_topological_sort( tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t i, ret_id; tsk_individual_table_t copy; tsk_individual_t individual; tsk_individual_table_t *individuals = &self->individuals; tsk_node_table_t *nodes = &self->nodes; tsk_size_t num_individuals = individuals->num_rows; tsk_id_t *traversal_order = tsk_malloc(num_individuals * sizeof(*traversal_order)); tsk_id_t *new_id_map = tsk_malloc(num_individuals * sizeof(*new_id_map)); if (new_id_map == NULL || traversal_order == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(new_id_map, 0xff, num_individuals * sizeof(*new_id_map)); ret = tsk_individual_table_copy(individuals, ©, 0); if (ret != 0) { goto out; } ret_id = tsk_table_collection_check_integrity(self, 0); if (ret_id != 0) { ret = (int) ret_id; goto out; } ret = tsk_individual_table_clear(individuals); if (ret != 0) { goto out; } ret = tsk_individual_table_topological_sort(©, traversal_order, NULL); if (ret != 0) { goto out; } /* The sorted individuals are in reverse 
order */ for (i = (tsk_id_t) num_individuals - 1; i >= 0; i--) { tsk_individual_table_get_row_unsafe(©, traversal_order[i], &individual); ret_id = tsk_individual_table_add_row(individuals, individual.flags, individual.location, individual.location_length, individual.parents, individual.parents_length, individual.metadata, individual.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } new_id_map[traversal_order[i]] = ret_id; } /* Rewrite the parent ids */ for (i = 0; i < (tsk_id_t) individuals->parents_length; i++) { if (individuals->parents[i] != TSK_NULL) { individuals->parents[i] = new_id_map[individuals->parents[i]]; } } /* Rewrite the node individual ids */ for (i = 0; i < (tsk_id_t) nodes->num_rows; i++) { if (nodes->individual[i] != TSK_NULL) { nodes->individual[i] = new_id_map[nodes->individual[i]]; } } ret = 0; out: tsk_safe_free(traversal_order); tsk_safe_free(new_id_map); tsk_individual_table_free(©); return ret; } static int tsk_table_sorter_sort_individuals_canonical(tsk_table_sorter_t *self) { int ret = 0; tsk_id_t ret_id, i, j, parent, mapped_parent; tsk_individual_table_t *individuals = &self->tables->individuals; tsk_node_table_t *nodes = &self->tables->nodes; tsk_individual_table_t copy; tsk_size_t num_individuals = individuals->num_rows; individual_canonical_sort_t *sorted_individuals = tsk_malloc(num_individuals * sizeof(*sorted_individuals)); tsk_id_t *individual_id_map = tsk_malloc(num_individuals * sizeof(*individual_id_map)); tsk_size_t *num_descendants = tsk_malloc(num_individuals * sizeof(*num_descendants)); tsk_id_t *traversal_order = tsk_malloc(num_individuals * sizeof(*traversal_order)); if (individual_id_map == NULL || sorted_individuals == NULL || traversal_order == NULL || num_descendants == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_individual_table_copy(individuals, ©, 0); if (ret != 0) { goto out; } ret = tsk_individual_table_clear(individuals); if (ret != 0) { goto out; } ret = 
tsk_individual_table_topological_sort(©, traversal_order, num_descendants); if (ret != 0) { goto out; } for (i = 0; i < (tsk_id_t) num_individuals; i++) { sorted_individuals[i].num_descendants = num_descendants[i]; sorted_individuals[i].first_node = (tsk_id_t) nodes->num_rows; } /* find first referring node */ for (j = 0; j < (tsk_id_t) nodes->num_rows; j++) { if (nodes->individual[j] != TSK_NULL) { sorted_individuals[nodes->individual[j]].first_node = TSK_MIN(j, sorted_individuals[nodes->individual[j]].first_node); } } for (j = 0; j < (tsk_id_t) num_individuals; j++) { tsk_individual_table_get_row_unsafe( ©, (tsk_id_t) j, &sorted_individuals[j].ind); } qsort(sorted_individuals, (size_t) num_individuals, sizeof(*sorted_individuals), cmp_individual_canonical); /* Make a first pass through the sorted individuals to build the ID map. */ for (j = 0; j < (tsk_id_t) num_individuals; j++) { individual_id_map[sorted_individuals[j].ind.id] = (tsk_id_t) j; } for (i = 0; i < (tsk_id_t) num_individuals; i++) { for (j = 0; j < (tsk_id_t) sorted_individuals[i].ind.parents_length; j++) { parent = sorted_individuals[i].ind.parents[j]; if (parent != TSK_NULL) { mapped_parent = individual_id_map[parent]; sorted_individuals[i].ind.parents[j] = mapped_parent; } } ret_id = tsk_individual_table_add_row(individuals, sorted_individuals[i].ind.flags, sorted_individuals[i].ind.location, sorted_individuals[i].ind.location_length, sorted_individuals[i].ind.parents, sorted_individuals[i].ind.parents_length, sorted_individuals[i].ind.metadata, sorted_individuals[i].ind.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } ret = 0; /* remap individuals in the node table */ for (i = 0; i < (tsk_id_t) nodes->num_rows; i++) { j = nodes->individual[i]; if (j != TSK_NULL) { nodes->individual[i] = individual_id_map[j]; } } out: tsk_safe_free(sorted_individuals); tsk_safe_free(individual_id_map); tsk_safe_free(traversal_order); tsk_safe_free(num_descendants); 
tsk_individual_table_free(©); return ret; } int tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) { int ret = 0; tsk_size_t edge_start = 0; tsk_size_t migration_start = 0; bool skip_sites = false; bool skip_individuals = false; if (start != NULL) { if (start->edges > self->tables->edges.num_rows) { ret = tsk_trace_error(TSK_ERR_EDGE_OUT_OF_BOUNDS); goto out; } edge_start = start->edges; if (start->migrations > self->tables->migrations.num_rows) { ret = tsk_trace_error(TSK_ERR_MIGRATION_OUT_OF_BOUNDS); goto out; } migration_start = start->migrations; /* We only allow sites and mutations to be specified as a way to * skip sorting them entirely. Both sites and mutations must be * equal to the number of rows */ if (start->sites == self->tables->sites.num_rows && start->mutations == self->tables->mutations.num_rows) { skip_sites = true; } else if (start->sites != 0 || start->mutations != 0) { ret = tsk_trace_error(TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); goto out; } } /* The indexes will be invalidated, so drop them */ ret = tsk_table_collection_drop_index(self->tables, 0); if (ret != 0) { goto out; } if (self->sort_edges != NULL) { ret = self->sort_edges(self, edge_start); if (ret != 0) { goto out; } } /* Avoid calling sort_migrations in the common case when it's a no-op */ if (self->tables->migrations.num_rows > 0) { ret = tsk_table_sorter_sort_migrations(self, migration_start); if (ret != 0) { goto out; } } if (!skip_sites) { ret = tsk_table_sorter_sort_sites(self); if (ret != 0) { goto out; } ret = self->sort_mutations(self); if (ret != 0) { goto out; } } if (!skip_individuals && self->sort_individuals != NULL) { ret = self->sort_individuals(self); if (ret != 0) { goto out; } } out: return ret; } int tsk_table_sorter_init( tsk_table_sorter_t *self, tsk_table_collection_t *tables, tsk_flags_t options) { int ret = 0; tsk_id_t ret_id; tsk_memset(self, 0, sizeof(tsk_table_sorter_t)); if (!(options & TSK_NO_CHECK_INTEGRITY)) { ret_id = 
tsk_table_collection_check_integrity(tables, 0); if (ret_id != 0) { ret = (int) ret_id; goto out; } } self->tables = tables; self->site_id_map = tsk_malloc(self->tables->sites.num_rows * sizeof(tsk_id_t)); if (self->site_id_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } /* Set the sort_edges and sort_mutations methods to the default. */ self->sort_edges = tsk_table_sorter_sort_edges; self->sort_mutations = tsk_table_sorter_sort_mutations; /* Default sort doesn't touch individuals */ self->sort_individuals = NULL; out: return ret; } int tsk_table_sorter_free(tsk_table_sorter_t *self) { tsk_safe_free(self->site_id_map); return 0; } /************************* * segment overlapper *************************/ typedef struct _interval_list_t { double left; double right; struct _interval_list_t *next; } interval_list_t; typedef struct _mutation_id_list_t { tsk_id_t mutation; struct _mutation_id_list_t *next; } mutation_id_list_t; typedef struct _tsk_segment_t { double left; double right; struct _tsk_segment_t *next; tsk_id_t node; } tsk_segment_t; /* segment overlap finding algorithm */ typedef struct { /* The input segments. This buffer is sorted by the algorithm and we also * assume that there is space for an extra element at the end */ tsk_segment_t *segments; tsk_size_t num_segments; tsk_size_t index; tsk_size_t num_overlapping; double left; double right; /* Output buffer */ tsk_size_t max_overlapping; tsk_segment_t **overlapping; } segment_overlapper_t; typedef struct { tsk_size_t num_samples; tsk_flags_t options; tsk_table_collection_t *tables; /* Keep a copy of the input tables */ tsk_table_collection_t input_tables; /* State for topology */ tsk_segment_t **ancestor_map_head; tsk_segment_t **ancestor_map_tail; /* Mapping of input node IDs to output node IDs. 
*/ tsk_id_t *node_id_map; bool *is_sample; /* Segments for a particular parent that are processed together */ tsk_segment_t *segment_queue; tsk_size_t segment_queue_size; tsk_size_t max_segment_queue_size; segment_overlapper_t segment_overlapper; tsk_blkalloc_t segment_heap; /* Buffer for output edges. For each child we keep a linked list of * intervals, and also store the actual children that have been buffered. */ tsk_blkalloc_t interval_list_heap; interval_list_t **child_edge_map_head; interval_list_t **child_edge_map_tail; tsk_id_t *buffered_children; tsk_size_t num_buffered_children; /* For each mutation, map its output node. */ tsk_id_t *mutation_node_map; /* Map of input nodes to the list of input mutation IDs */ mutation_id_list_t **node_mutation_list_map_head; mutation_id_list_t **node_mutation_list_map_tail; mutation_id_list_t *node_mutation_list_mem; /* When reducing topology, we need a map positions to their corresponding * sites.*/ double *position_lookup; int64_t edge_sort_offset; } simplifier_t; static int cmp_segment(const void *a, const void *b) { const tsk_segment_t *ia = (const tsk_segment_t *) a; const tsk_segment_t *ib = (const tsk_segment_t *) b; int ret = (ia->left > ib->left) - (ia->left < ib->left); /* Break ties using the node */ if (ret == 0) { ret = (ia->node > ib->node) - (ia->node < ib->node); } return ret; } static int TSK_WARN_UNUSED segment_overlapper_alloc(segment_overlapper_t *self) { int ret = 0; tsk_memset(self, 0, sizeof(*self)); self->max_overlapping = 8; /* Making sure we call tsk_realloc in tests */ self->overlapping = tsk_malloc(self->max_overlapping * sizeof(*self->overlapping)); if (self->overlapping == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } out: return ret; } static int segment_overlapper_free(segment_overlapper_t *self) { tsk_safe_free(self->overlapping); return 0; } /* Initialise the segment overlapper for use. Note that the segments * array must have space for num_segments + 1 elements! 
*/ static int TSK_WARN_UNUSED segment_overlapper_start( segment_overlapper_t *self, tsk_segment_t *segments, tsk_size_t num_segments) { int ret = 0; tsk_segment_t *sentinel; void *p; if (self->max_overlapping < num_segments) { self->max_overlapping = num_segments; p = tsk_realloc( self->overlapping, self->max_overlapping * sizeof(*self->overlapping)); if (p == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->overlapping = p; } self->segments = segments; self->num_segments = num_segments; self->index = 0; self->num_overlapping = 0; self->left = 0; self->right = DBL_MAX; /* Sort the segments in the buffer by left coordinate */ qsort( self->segments, (size_t) self->num_segments, sizeof(tsk_segment_t), cmp_segment); /* NOTE! We are assuming that there's space for another element on the end * here. This is to insert a sentinel which simplifies the logic. */ sentinel = self->segments + self->num_segments; sentinel->left = DBL_MAX; out: return ret; } static int TSK_WARN_UNUSED segment_overlapper_next(segment_overlapper_t *self, double *left, double *right, tsk_segment_t ***overlapping, tsk_size_t *num_overlapping) { int ret = 0; tsk_size_t j, k; tsk_size_t n = self->num_segments; tsk_segment_t *S = self->segments; if (self->index < n) { self->left = self->right; /* Remove any elements of X with right <= left */ k = 0; for (j = 0; j < self->num_overlapping; j++) { if (self->overlapping[j]->right > self->left) { self->overlapping[k] = self->overlapping[j]; k++; } } self->num_overlapping = k; if (k == 0) { self->left = S[self->index].left; } while (self->index < n && S[self->index].left == self->left) { tsk_bug_assert(self->num_overlapping < self->max_overlapping); self->overlapping[self->num_overlapping] = &S[self->index]; self->num_overlapping++; self->index++; } self->index--; self->right = S[self->index + 1].left; for (j = 0; j < self->num_overlapping; j++) { self->right = TSK_MIN(self->right, self->overlapping[j]->right); } tsk_bug_assert(self->left < 
self->right); self->index++; ret = 1; } else { self->left = self->right; self->right = DBL_MAX; k = 0; for (j = 0; j < self->num_overlapping; j++) { if (self->overlapping[j]->right > self->left) { self->right = TSK_MIN(self->right, self->overlapping[j]->right); self->overlapping[k] = self->overlapping[j]; k++; } } self->num_overlapping = k; if (k > 0) { ret = 1; } } *left = self->left; *right = self->right; *overlapping = self->overlapping; *num_overlapping = self->num_overlapping; return ret; } static int cmp_node_id(const void *a, const void *b) { const tsk_id_t *ia = (const tsk_id_t *) a; const tsk_id_t *ib = (const tsk_id_t *) b; return (*ia > *ib) - (*ia < *ib); } /************************* * Ancestor mapper *************************/ /* NOTE: this struct shares a lot with the simplifier_t, mostly in * terms of infrastructure for managing the list of intervals, saving * edges etc. We should try to abstract the common functionality out * into a separate class, which handles this. */ typedef struct { tsk_id_t *samples; tsk_size_t num_samples; tsk_id_t *ancestors; tsk_size_t num_ancestors; tsk_table_collection_t *tables; tsk_edge_table_t *result; tsk_segment_t **ancestor_map_head; tsk_segment_t **ancestor_map_tail; bool *is_sample; bool *is_ancestor; tsk_segment_t *segment_queue; tsk_size_t segment_queue_size; tsk_size_t max_segment_queue_size; segment_overlapper_t segment_overlapper; tsk_blkalloc_t segment_heap; tsk_blkalloc_t interval_list_heap; interval_list_t **child_edge_map_head; interval_list_t **child_edge_map_tail; tsk_id_t *buffered_children; tsk_size_t num_buffered_children; double sequence_length; double oldest_node_time; } ancestor_mapper_t; static tsk_segment_t *TSK_WARN_UNUSED ancestor_mapper_alloc_segment( ancestor_mapper_t *self, double left, double right, tsk_id_t node) { tsk_segment_t *seg = NULL; seg = tsk_blkalloc_get(&self->segment_heap, sizeof(*seg)); if (seg == NULL) { goto out; } seg->next = NULL; seg->left = left; seg->right = right; 
seg->node = node; out: return seg; } static interval_list_t *TSK_WARN_UNUSED ancestor_mapper_alloc_interval_list(ancestor_mapper_t *self, double left, double right) { interval_list_t *x = NULL; x = tsk_blkalloc_get(&self->interval_list_heap, sizeof(*x)); if (x == NULL) { goto out; } x->next = NULL; x->left = left; x->right = right; out: return x; } static int ancestor_mapper_flush_edges( ancestor_mapper_t *self, tsk_id_t parent, tsk_size_t *ret_num_edges) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_id_t child; interval_list_t *x; tsk_size_t num_edges = 0; qsort(self->buffered_children, (size_t) self->num_buffered_children, sizeof(tsk_id_t), cmp_node_id); for (j = 0; j < self->num_buffered_children; j++) { child = self->buffered_children[j]; for (x = self->child_edge_map_head[child]; x != NULL; x = x->next) { ret_id = tsk_edge_table_add_row( self->result, x->left, x->right, parent, child, NULL, 0); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_edges++; } self->child_edge_map_head[child] = NULL; self->child_edge_map_tail[child] = NULL; } self->num_buffered_children = 0; *ret_num_edges = num_edges; ret = tsk_blkalloc_reset(&self->interval_list_heap); out: return ret; } static int ancestor_mapper_record_edge( ancestor_mapper_t *self, double left, double right, tsk_id_t child) { int ret = 0; interval_list_t *tail, *x; tail = self->child_edge_map_tail[child]; if (tail == NULL) { tsk_bug_assert(self->num_buffered_children < self->tables->nodes.num_rows); self->buffered_children[self->num_buffered_children] = child; self->num_buffered_children++; x = ancestor_mapper_alloc_interval_list(self, left, right); if (x == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->child_edge_map_head[child] = x; self->child_edge_map_tail[child] = x; } else { if (tail->right == left) { tail->right = right; } else { x = ancestor_mapper_alloc_interval_list(self, left, right); if (x == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tail->next = x; 
self->child_edge_map_tail[child] = x; } } out: return ret; } static int TSK_WARN_UNUSED ancestor_mapper_add_ancestry(ancestor_mapper_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) { int ret = 0; tsk_segment_t *tail = self->ancestor_map_tail[input_id]; tsk_segment_t *x; tsk_bug_assert(left < right); if (tail == NULL) { x = ancestor_mapper_alloc_segment(self, left, right, output_id); if (x == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->ancestor_map_head[input_id] = x; self->ancestor_map_tail[input_id] = x; } else { if (tail->right == left && tail->node == output_id) { tail->right = right; } else { x = ancestor_mapper_alloc_segment(self, left, right, output_id); if (x == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tail->next = x; self->ancestor_map_tail[input_id] = x; } } out: return ret; } static void ancestor_mapper_find_oldest_node(ancestor_mapper_t *self) { const double *node_time = self->tables->nodes.time; tsk_size_t j; double max_time = -1; for (j = 0; j < self->num_ancestors; j++) { max_time = TSK_MAX(max_time, node_time[self->ancestors[j]]); } for (j = 0; j < self->num_samples; j++) { max_time = TSK_MAX(max_time, node_time[self->samples[j]]); } self->oldest_node_time = max_time; } static int ancestor_mapper_init_samples(ancestor_mapper_t *self, tsk_id_t *samples) { int ret = 0; tsk_size_t j; /* Go through the samples to check for errors. 
*/ for (j = 0; j < self->num_samples; j++) { if (samples[j] < 0 || samples[j] > (tsk_id_t) self->tables->nodes.num_rows) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } if (self->is_sample[samples[j]]) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } self->is_sample[samples[j]] = true; ret = ancestor_mapper_add_ancestry( self, samples[j], 0, self->tables->sequence_length, samples[j]); if (ret != 0) { goto out; } } out: return ret; } static int ancestor_mapper_init_ancestors(ancestor_mapper_t *self, tsk_id_t *ancestors) { int ret = 0; tsk_size_t j; /* Go through the samples to check for errors. */ for (j = 0; j < self->num_ancestors; j++) { if (ancestors[j] < 0 || ancestors[j] > (tsk_id_t) self->tables->nodes.num_rows) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } if (self->is_ancestor[ancestors[j]]) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } self->is_ancestor[ancestors[j]] = true; } out: return ret; } static int ancestor_mapper_init(ancestor_mapper_t *self, tsk_id_t *samples, tsk_size_t num_samples, tsk_id_t *ancestors, tsk_size_t num_ancestors, tsk_table_collection_t *tables, tsk_edge_table_t *result) { int ret = 0; tsk_size_t num_nodes; tsk_memset(self, 0, sizeof(ancestor_mapper_t)); self->num_samples = num_samples; self->num_ancestors = num_ancestors; self->samples = samples; self->ancestors = ancestors; self->tables = tables; self->result = result; self->sequence_length = self->tables->sequence_length; if (samples == NULL || num_samples == 0 || ancestors == NULL || num_ancestors == 0) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } /* Allocate the heaps used for small objects-> Assuming 8K is a good chunk size */ ret = tsk_blkalloc_init(&self->segment_heap, 8192); if (ret != 0) { goto out; } ret = tsk_blkalloc_init(&self->interval_list_heap, 8192); if (ret != 0) { goto out; } ret = segment_overlapper_alloc(&self->segment_overlapper); if (ret != 0) { goto out; } num_nodes = 
tables->nodes.num_rows; /* Make the maps and set the intial state */ self->ancestor_map_head = tsk_calloc(num_nodes, sizeof(tsk_segment_t *)); self->ancestor_map_tail = tsk_calloc(num_nodes, sizeof(tsk_segment_t *)); self->child_edge_map_head = tsk_calloc(num_nodes, sizeof(interval_list_t *)); self->child_edge_map_tail = tsk_calloc(num_nodes, sizeof(interval_list_t *)); self->buffered_children = tsk_malloc(num_nodes * sizeof(tsk_id_t)); self->is_sample = tsk_calloc(num_nodes, sizeof(bool)); self->is_ancestor = tsk_calloc(num_nodes, sizeof(bool)); self->max_segment_queue_size = 64; self->segment_queue = tsk_malloc(self->max_segment_queue_size * sizeof(tsk_segment_t)); if (self->ancestor_map_head == NULL || self->ancestor_map_tail == NULL || self->child_edge_map_head == NULL || self->child_edge_map_tail == NULL || self->is_sample == NULL || self->is_ancestor == NULL || self->segment_queue == NULL || self->buffered_children == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } // Clear memory. 
ret = ancestor_mapper_init_samples(self, samples); if (ret != 0) { goto out; } ret = ancestor_mapper_init_ancestors(self, ancestors); if (ret != 0) { goto out; } ancestor_mapper_find_oldest_node(self); ret = tsk_edge_table_clear(self->result); if (ret != 0) { goto out; } out: return ret; } static int ancestor_mapper_free(ancestor_mapper_t *self) { tsk_blkalloc_free(&self->segment_heap); tsk_blkalloc_free(&self->interval_list_heap); segment_overlapper_free(&self->segment_overlapper); tsk_safe_free(self->ancestor_map_head); tsk_safe_free(self->ancestor_map_tail); tsk_safe_free(self->child_edge_map_head); tsk_safe_free(self->child_edge_map_tail); tsk_safe_free(self->segment_queue); tsk_safe_free(self->is_sample); tsk_safe_free(self->is_ancestor); tsk_safe_free(self->buffered_children); return 0; } static int TSK_WARN_UNUSED ancestor_mapper_enqueue_segment( ancestor_mapper_t *self, double left, double right, tsk_id_t node) { int ret = 0; tsk_segment_t *seg; void *p; tsk_bug_assert(left < right); /* Make sure we always have room for one more segment in the queue so we * can put a tail sentinel on it */ if (self->segment_queue_size == self->max_segment_queue_size - 1) { self->max_segment_queue_size *= 2; p = tsk_realloc(self->segment_queue, self->max_segment_queue_size * sizeof(*self->segment_queue)); if (p == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->segment_queue = p; } seg = self->segment_queue + self->segment_queue_size; seg->left = left; seg->right = right; seg->node = node; self->segment_queue_size++; out: return ret; } static int TSK_WARN_UNUSED ancestor_mapper_merge_ancestors(ancestor_mapper_t *self, tsk_id_t input_id) { int ret = 0; tsk_segment_t **X, *x; tsk_size_t j, num_overlapping, num_flushed_edges; double left, right, prev_right; bool is_sample = self->is_sample[input_id]; bool is_ancestor = self->is_ancestor[input_id]; if (is_sample) { /* Free up the existing ancestry mapping. 
*/ x = self->ancestor_map_tail[input_id]; tsk_bug_assert(x->left == 0 && x->right == self->sequence_length); self->ancestor_map_head[input_id] = NULL; self->ancestor_map_tail[input_id] = NULL; } ret = segment_overlapper_start( &self->segment_overlapper, self->segment_queue, self->segment_queue_size); if (ret != 0) { goto out; } prev_right = 0; while ((ret = segment_overlapper_next( &self->segment_overlapper, &left, &right, &X, &num_overlapping)) == 1) { tsk_bug_assert(left < right); tsk_bug_assert(num_overlapping > 0); if (is_ancestor || is_sample) { for (j = 0; j < num_overlapping; j++) { ret = ancestor_mapper_record_edge(self, left, right, X[j]->node); if (ret != 0) { goto out; } } ret = ancestor_mapper_add_ancestry(self, input_id, left, right, input_id); if (ret != 0) { goto out; } if (is_sample && left != prev_right) { /* Fill in any gaps in ancestry for the sample */ ret = ancestor_mapper_add_ancestry( self, input_id, prev_right, left, input_id); if (ret != 0) { goto out; } } } else { for (j = 0; j < num_overlapping; j++) { ret = ancestor_mapper_add_ancestry( self, input_id, left, right, X[j]->node); if (ret != 0) { goto out; } } } prev_right = right; } if (is_sample && prev_right != self->tables->sequence_length) { /* If a trailing gap exists in the sample ancestry, fill it in. */ ret = ancestor_mapper_add_ancestry( self, input_id, prev_right, self->sequence_length, input_id); if (ret != 0) { goto out; } } if (input_id != TSK_NULL) { ret = ancestor_mapper_flush_edges(self, input_id, &num_flushed_edges); if (ret != 0) { goto out; } } out: return ret; } static int TSK_WARN_UNUSED ancestor_mapper_process_parent_edges( ancestor_mapper_t *self, tsk_id_t parent, tsk_size_t start, tsk_size_t end) { int ret = 0; tsk_size_t j; tsk_segment_t *x; const tsk_edge_table_t *input_edges = &self->tables->edges; tsk_id_t child; double left, right; /* Go through the edges and queue up ancestry segments for processing. 
*/ self->segment_queue_size = 0; for (j = start; j < end; j++) { tsk_bug_assert(parent == input_edges->parent[j]); child = input_edges->child[j]; left = input_edges->left[j]; right = input_edges->right[j]; // printf("C: %i, L: %f, R: %f\n", child, left, right); for (x = self->ancestor_map_head[child]; x != NULL; x = x->next) { if (x->right > left && right > x->left) { ret = ancestor_mapper_enqueue_segment( self, TSK_MAX(x->left, left), TSK_MIN(x->right, right), x->node); if (ret != 0) { goto out; } } } } // We can now merge the ancestral segments for the parent ret = ancestor_mapper_merge_ancestors(self, parent); if (ret != 0) { goto out; } out: return ret; } static int TSK_WARN_UNUSED ancestor_mapper_run(ancestor_mapper_t *self) { int ret = 0; tsk_size_t j, start; tsk_id_t parent, current_parent; const tsk_edge_table_t *input_edges = &self->tables->edges; tsk_size_t num_edges = input_edges->num_rows; const double *node_time = self->tables->nodes.time; bool early_exit = false; if (num_edges > 0) { start = 0; current_parent = input_edges->parent[0]; for (j = 0; j < num_edges; j++) { parent = input_edges->parent[j]; if (parent != current_parent) { ret = ancestor_mapper_process_parent_edges( self, current_parent, start, j); if (ret != 0) { goto out; } start = j; current_parent = parent; if (node_time[current_parent] > self->oldest_node_time) { early_exit = true; break; } } } if (!early_exit) { /* If we didn't break out of the loop early, we need to still process * the final parent */ ret = ancestor_mapper_process_parent_edges(self, current_parent, start, j); if (ret != 0) { goto out; } } } out: return ret; } /************************* * IBD Segments *************************/ /* This maps two positive integers 0 <= a < b < N into the set * {0, ..., N^2}. For us to overflow an int64, N would need to * be > sqrt(2^63), ~3 * 10^9. The maximum value for a 32bit int * is ~2 * 10^9, so this can't happen here, however it is * theoretically possible with 64 bit IDs. 
It would require * a *very* large node table --- assuming 24 bytes per row * it would be at least 67GiB. To make sure this eventuality * doesn't happen, we have a tsk_bug_assert in the * tsk_identity_segments_init. */ static inline int64_t pair_to_integer(tsk_id_t a, tsk_id_t b, tsk_size_t N) { tsk_id_t tmp; if (a > b) { tmp = a; a = b; b = tmp; } return ((int64_t) a) * (int64_t) N + (int64_t) b; } static inline void integer_to_pair(int64_t index, tsk_size_t N, tsk_id_t *a, tsk_id_t *b) { *a = (tsk_id_t) (index / (int64_t) N); *b = (tsk_id_t) (index % (int64_t) N); } static int64_t tsk_identity_segments_get_key( const tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b) { int64_t ret; tsk_id_t N = (tsk_id_t) self->num_nodes; if (a < 0 || b < 0 || a >= N || b >= N) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } if (a == b) { ret = tsk_trace_error(TSK_ERR_SAME_NODES_IN_PAIR); goto out; } ret = pair_to_integer(a, b, self->num_nodes); out: return ret; } static tsk_identity_segment_t *TSK_WARN_UNUSED tsk_identity_segments_alloc_segment( tsk_identity_segments_t *self, double left, double right, tsk_id_t node) { tsk_identity_segment_t *seg = tsk_blkalloc_get(&self->heap, sizeof(*seg)); if (seg == NULL) { goto out; } tsk_bug_assert(left < right); tsk_bug_assert(node >= 0 && node < (tsk_id_t) self->num_nodes); seg->next = NULL; seg->left = left; seg->right = right; seg->node = node; out: return seg; } static tsk_avl_node_int_t * tsk_identity_segments_alloc_new_pair(tsk_identity_segments_t *self, int64_t key) { tsk_avl_node_int_t *avl_node = tsk_blkalloc_get(&self->heap, sizeof(*avl_node)); tsk_identity_segment_list_t *list = tsk_blkalloc_get(&self->heap, sizeof(*list)); if (avl_node == NULL || list == NULL) { return NULL; } avl_node->key = key; avl_node->value = list; memset(list, 0, sizeof(*list)); return avl_node; } /* Deliberately not making this a part of the public interface for now, * so we don't have to worry about the signature */ static int 
tsk_identity_segments_init( tsk_identity_segments_t *self, tsk_size_t num_nodes, tsk_flags_t options) { int ret = 0; /* Make sure we don't overflow in the ID mapping. See the comments in pair_to_integer * for details. */ double max_num_nodes = sqrt(1ULL << 63); tsk_bug_assert((double) num_nodes < max_num_nodes); memset(self, 0, sizeof(*self)); self->num_nodes = num_nodes; /* Storing segments implies storing pairs */ if (options & TSK_IBD_STORE_SEGMENTS) { self->store_pairs = true; self->store_segments = true; } else if (options & TSK_IBD_STORE_PAIRS) { self->store_pairs = true; } ret = tsk_avl_tree_int_init(&self->pair_map); if (ret != 0) { goto out; } /* Allocate heap memory in 1MiB blocks */ ret = tsk_blkalloc_init(&self->heap, 1024 * 1024); if (ret != 0) { goto out; } out: return ret; } void tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out) { tsk_avl_node_int_t **nodes = tsk_malloc(self->pair_map.size * sizeof(*nodes)); int64_t key; tsk_identity_segment_list_t *value; tsk_identity_segment_t *seg; tsk_size_t j; tsk_id_t a, b; tsk_bug_assert(nodes != NULL); fprintf(out, "===\nIBD Result\n===\n"); fprintf(out, "total_span = %f\n", self->total_span); fprintf(out, "num_segments = %lld\n", (unsigned long long) self->num_segments); fprintf(out, "store_pairs = %d\n", self->store_pairs); fprintf(out, "store_segments = %d\n", self->store_segments); if (self->store_pairs) { fprintf(out, "num_keys = %d\n", (int) self->pair_map.size); tsk_avl_tree_int_ordered_nodes(&self->pair_map, nodes); for (j = 0; j < self->pair_map.size; j++) { key = nodes[j]->key; value = (tsk_identity_segment_list_t *) nodes[j]->value; integer_to_pair(key, self->num_nodes, &a, &b); fprintf(out, "%lld\t(%d,%d) n=%d total_span=%f\t", (long long) key, (int) a, (int) b, (int) value->num_segments, value->total_span); if (self->store_segments) { for (seg = value->head; seg != NULL; seg = seg->next) { fprintf( out, "(%f, %f)->%d, ", seg->left, seg->right, (int) seg->node); } } 
fprintf(out, "\n"); } } fprintf(out, "Segment memory\n"); tsk_blkalloc_print_state(&self->heap, out); tsk_safe_free(nodes); } tsk_size_t tsk_identity_segments_get_num_segments(const tsk_identity_segments_t *self) { return self->num_segments; } double tsk_identity_segments_get_total_span(const tsk_identity_segments_t *self) { return self->total_span; } tsk_size_t tsk_identity_segments_get_num_pairs(const tsk_identity_segments_t *self) { return self->pair_map.size; } /* Use an inorder traversal on the AVL tree to get the pairs in order. * Recursion is safe here because it's a balanced tree (see the AVL tree * code for notes on this). */ static int get_keys_traverse(tsk_avl_node_int_t *node, int index, tsk_size_t N, tsk_id_t *pairs) { tsk_id_t a, b; if (node == NULL) { return index; } index = get_keys_traverse(node->llink, index, N, pairs); integer_to_pair(node->key, N, &a, &b); pairs[2 * index] = a; pairs[2 * index + 1] = b; return get_keys_traverse(node->rlink, index + 1, N, pairs); } int tsk_identity_segments_get_keys(const tsk_identity_segments_t *self, tsk_id_t *pairs) { if (!self->store_pairs) { return TSK_ERR_IBD_PAIRS_NOT_STORED; } get_keys_traverse( tsk_avl_tree_int_get_root(&self->pair_map), 0, self->num_nodes, pairs); return 0; } static int get_items_traverse(tsk_avl_node_int_t *node, int index, tsk_size_t N, tsk_id_t *pairs, tsk_identity_segment_list_t **lists) { tsk_id_t a, b; if (node == NULL) { return index; } index = get_items_traverse(node->llink, index, N, pairs, lists); integer_to_pair(node->key, N, &a, &b); pairs[2 * index] = a; pairs[2 * index + 1] = b; lists[index] = node->value; return get_items_traverse(node->rlink, index + 1, N, pairs, lists); } int tsk_identity_segments_get_items(const tsk_identity_segments_t *self, tsk_id_t *pairs, tsk_identity_segment_list_t **lists) { if (!self->store_pairs) { return TSK_ERR_IBD_PAIRS_NOT_STORED; } get_items_traverse( tsk_avl_tree_int_get_root(&self->pair_map), 0, self->num_nodes, pairs, lists); return 0; 
} int tsk_identity_segments_free(tsk_identity_segments_t *self) { tsk_blkalloc_free(&self->heap); tsk_avl_tree_int_free(&self->pair_map); return 0; } static int TSK_WARN_UNUSED tsk_identity_segments_update_pair(tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b, double left, double right, tsk_id_t node) { int ret = 0; tsk_identity_segment_t *x; tsk_identity_segment_list_t *list; /* skip the error checking here since this an internal API */ int64_t key = pair_to_integer(a, b, self->num_nodes); tsk_avl_node_int_t *avl_node = tsk_avl_tree_int_search(&self->pair_map, key); if (avl_node == NULL) { /* We haven't seen this pair before */ avl_node = tsk_identity_segments_alloc_new_pair(self, key); if (avl_node == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_avl_tree_int_insert(&self->pair_map, avl_node); tsk_bug_assert(ret == 0); } list = (tsk_identity_segment_list_t *) avl_node->value; list->num_segments++; list->total_span += right - left; if (self->store_segments) { x = tsk_identity_segments_alloc_segment(self, left, right, node); if (x == NULL) { goto out; } if (list->tail == NULL) { list->head = x; list->tail = x; } else { list->tail->next = x; list->tail = x; } } out: return ret; } static int TSK_WARN_UNUSED tsk_identity_segments_add_segment(tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b, double left, double right, tsk_id_t node) { int ret = 0; if (self->store_pairs) { ret = tsk_identity_segments_update_pair(self, a, b, left, right, node); if (ret != 0) { goto out; } } self->total_span += right - left; self->num_segments++; out: return ret; } int TSK_WARN_UNUSED tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t sample_a, tsk_id_t sample_b, tsk_identity_segment_list_t **ret_list) { int ret = 0; int64_t key = tsk_identity_segments_get_key(self, sample_a, sample_b); tsk_avl_node_int_t *avl_node; if (key < 0) { ret = (int) key; goto out; } if (!self->store_pairs) { ret = 
tsk_trace_error(TSK_ERR_IBD_PAIRS_NOT_STORED); goto out; } avl_node = tsk_avl_tree_int_search(&self->pair_map, key); *ret_list = NULL; if (avl_node != NULL) { *ret_list = (tsk_identity_segment_list_t *) avl_node->value; } out: return ret; } /************************* * IBD finder *************************/ typedef struct { tsk_identity_segments_t *result; double min_span; double max_time; const tsk_table_collection_t *tables; /* Maps nodes to their sample set IDs. Input samples map to set 0 * in the "within" case. */ tsk_id_t *sample_set_id; /* True if we're finding IBD between sample sets, false otherwise. */ bool finding_between; tsk_segment_t **ancestor_map_head; tsk_segment_t **ancestor_map_tail; tsk_segment_t *segment_queue; tsk_size_t segment_queue_size; tsk_size_t max_segment_queue_size; tsk_blkalloc_t segment_heap; } tsk_ibd_finder_t; static tsk_segment_t *TSK_WARN_UNUSED tsk_ibd_finder_alloc_segment( tsk_ibd_finder_t *self, double left, double right, tsk_id_t node) { tsk_segment_t *seg = NULL; seg = tsk_blkalloc_get(&self->segment_heap, sizeof(*seg)); if (seg == NULL) { goto out; } seg->next = NULL; seg->left = left; seg->right = right; seg->node = node; out: return seg; } static int TSK_WARN_UNUSED tsk_ibd_finder_add_ancestry(tsk_ibd_finder_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) { int ret = 0; tsk_segment_t *tail = self->ancestor_map_tail[input_id]; tsk_segment_t *x = NULL; tsk_bug_assert(left < right); x = tsk_ibd_finder_alloc_segment(self, left, right, output_id); if (x == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (tail == NULL) { self->ancestor_map_head[input_id] = x; self->ancestor_map_tail[input_id] = x; } else { tail->next = x; self->ancestor_map_tail[input_id] = x; } out: return ret; } static int tsk_ibd_finder_init_samples_from_set( tsk_ibd_finder_t *self, const tsk_id_t *samples, tsk_size_t num_samples) { int ret = 0; tsk_size_t j; tsk_id_t u; for (j = 0; j < num_samples; j++) { u = 
samples[j]; if (u < 0 || u > (tsk_id_t) self->tables->nodes.num_rows) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } if (self->sample_set_id[u] != TSK_NULL) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } self->sample_set_id[u] = 0; } out: return ret; } static void tsk_ibd_finder_init_samples_from_nodes(tsk_ibd_finder_t *self) { tsk_id_t u; const tsk_id_t num_nodes = (tsk_id_t) self->tables->nodes.num_rows; const tsk_flags_t *restrict flags = self->tables->nodes.flags; for (u = 0; u < num_nodes; u++) { if (flags[u] & TSK_NODE_IS_SAMPLE) { self->sample_set_id[u] = 0; } } } static int tsk_ibd_finder_add_sample_ancestry(tsk_ibd_finder_t *self) { int ret = 0; tsk_id_t u; const tsk_id_t num_nodes = (tsk_id_t) self->tables->nodes.num_rows; const double L = self->tables->sequence_length; for (u = 0; u < num_nodes; u++) { if (self->sample_set_id[u] != TSK_NULL) { ret = tsk_ibd_finder_add_ancestry(self, u, 0, L, u); if (ret != 0) { goto out; } } } out: return ret; } static int TSK_WARN_UNUSED tsk_ibd_finder_init(tsk_ibd_finder_t *self, const tsk_table_collection_t *tables, tsk_identity_segments_t *result, double min_span, double max_time) { int ret = 0; tsk_size_t num_nodes; tsk_memset(self, 0, sizeof(tsk_ibd_finder_t)); if (min_span < 0) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } if (max_time < 0) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } self->tables = tables; self->result = result; self->max_time = max_time; self->min_span = min_span; ret = tsk_blkalloc_init(&self->segment_heap, 8192); if (ret != 0) { goto out; } num_nodes = tables->nodes.num_rows; self->ancestor_map_head = tsk_calloc(num_nodes, sizeof(*self->ancestor_map_head)); self->ancestor_map_tail = tsk_calloc(num_nodes, sizeof(*self->ancestor_map_tail)); self->sample_set_id = tsk_malloc(num_nodes * sizeof(*self->sample_set_id)); self->segment_queue_size = 0; self->max_segment_queue_size = 64; self->segment_queue = 
tsk_malloc(self->max_segment_queue_size * sizeof(*self->segment_queue)); if (self->ancestor_map_head == NULL || self->ancestor_map_tail == NULL || self->sample_set_id == NULL || self->segment_queue == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(self->sample_set_id, TSK_NULL, num_nodes * sizeof(*self->sample_set_id)); out: return ret; } static int TSK_WARN_UNUSED tsk_ibd_finder_enqueue_segment( tsk_ibd_finder_t *self, double left, double right, tsk_id_t node) { int ret = 0; tsk_segment_t *seg; void *p; if ((right - left) > self->min_span) { /* Make sure we always have room for one more segment in the queue so we * can put a tail sentinel on it */ if (self->segment_queue_size == self->max_segment_queue_size - 1) { self->max_segment_queue_size *= 2; p = tsk_realloc(self->segment_queue, self->max_segment_queue_size * sizeof(*self->segment_queue)); if (p == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->segment_queue = p; } seg = self->segment_queue + self->segment_queue_size; seg->left = left; seg->right = right; seg->node = node; self->segment_queue_size++; } out: return ret; } static bool tsk_ibd_finder_passes_filters( const tsk_ibd_finder_t *self, tsk_id_t a, tsk_id_t b, double left, double right) { if (a == b) { return false; } if ((right - left) <= self->min_span) { return false; } if (self->finding_between) { return self->sample_set_id[a] != self->sample_set_id[b]; } else { return true; } } static int TSK_WARN_UNUSED tsk_ibd_finder_record_ibd(tsk_ibd_finder_t *self, tsk_id_t parent) { int ret = 0; tsk_size_t j; tsk_segment_t *seg0, *seg1; double left, right; for (seg0 = self->ancestor_map_head[parent]; seg0 != NULL; seg0 = seg0->next) { for (j = 0; j < self->segment_queue_size; j++) { seg1 = &self->segment_queue[j]; left = TSK_MAX(seg0->left, seg1->left); right = TSK_MIN(seg0->right, seg1->right); if (tsk_ibd_finder_passes_filters( self, seg0->node, seg1->node, left, right)) { ret = 
tsk_identity_segments_add_segment( self->result, seg0->node, seg1->node, left, right, parent); if (ret != 0) { goto out; } } } } out: return ret; } static int TSK_WARN_UNUSED tsk_ibd_finder_add_queued_ancestry(tsk_ibd_finder_t *self, tsk_id_t parent) { int ret = 0; tsk_size_t j; tsk_segment_t seg; for (j = 0; j < self->segment_queue_size; j++) { seg = self->segment_queue[j]; ret = tsk_ibd_finder_add_ancestry(self, parent, seg.left, seg.right, seg.node); if (ret != 0) { goto out; } } self->segment_queue_size = 0; out: return ret; } static void tsk_ibd_finder_print_state(tsk_ibd_finder_t *self, FILE *out) { tsk_size_t j; tsk_segment_t *u = NULL; fprintf(out, "--ibd-finder stats--\n"); fprintf(out, "max_time = %f\n", self->max_time); fprintf(out, "min_span = %f\n", self->min_span); fprintf(out, "finding_between = %d\n", self->finding_between); fprintf(out, "===\nEdges\n===\n"); for (j = 0; j < self->tables->edges.num_rows; j++) { fprintf(out, "L:%f, R:%f, P:%lld, C:%lld\n", self->tables->edges.left[j], self->tables->edges.right[j], (long long) self->tables->edges.parent[j], (long long) self->tables->edges.child[j]); } fprintf(out, "===\nNodes\n===\n"); for (j = 0; j < self->tables->nodes.num_rows; j++) { fprintf(out, "ID:%d, Time:%f, Flag:%lld Sample set:%d\n", (int) j, self->tables->nodes.time[j], (long long) self->tables->nodes.flags[j], (int) self->sample_set_id[j]); } fprintf(out, "===\nAncestral map\n===\n"); for (j = 0; j < self->tables->nodes.num_rows; j++) { fprintf(out, "Node %lld: ", (long long) j); for (u = self->ancestor_map_head[j]; u != NULL; u = u->next) { fprintf(out, "(%f,%f->%lld)", u->left, u->right, (long long) u->node); } fprintf(out, "\n"); } tsk_identity_segments_print_state(self->result, out); } static int TSK_WARN_UNUSED tsk_ibd_finder_init_within( tsk_ibd_finder_t *self, const tsk_id_t *samples, tsk_size_t num_samples) { int ret; if (samples == NULL) { tsk_ibd_finder_init_samples_from_nodes(self); } else { ret = 
tsk_ibd_finder_init_samples_from_set(self, samples, num_samples); if (ret != 0) { goto out; } } self->finding_between = false; ret = tsk_ibd_finder_add_sample_ancestry(self); out: return ret; } static int TSK_WARN_UNUSED tsk_ibd_finder_init_between(tsk_ibd_finder_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets) { int ret = 0; tsk_size_t j, k, index; tsk_id_t u; index = 0; for (j = 0; j < num_sample_sets; j++) { for (k = 0; k < sample_set_sizes[j]; k++) { u = sample_sets[index]; if (u < 0 || u > (tsk_id_t) self->tables->nodes.num_rows) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } if (self->sample_set_id[u] != TSK_NULL) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } self->sample_set_id[u] = (tsk_id_t) j; index++; } } self->finding_between = true; ret = tsk_ibd_finder_add_sample_ancestry(self); out: return ret; } static int TSK_WARN_UNUSED tsk_ibd_finder_run(tsk_ibd_finder_t *self) { const tsk_edge_table_t *input_edges = &self->tables->edges; const tsk_size_t num_edges = input_edges->num_rows; int ret = 0; tsk_size_t j; tsk_segment_t *s; tsk_id_t parent, child; double left, right, intvl_l, intvl_r, time; for (j = 0; j < num_edges; j++) { parent = input_edges->parent[j]; left = input_edges->left[j]; right = input_edges->right[j]; child = input_edges->child[j]; time = self->tables->nodes.time[parent]; if (time > self->max_time) { break; } for (s = self->ancestor_map_head[child]; s != NULL; s = s->next) { intvl_l = TSK_MAX(left, s->left); intvl_r = TSK_MIN(right, s->right); ret = tsk_ibd_finder_enqueue_segment(self, intvl_l, intvl_r, s->node); if (ret != 0) { goto out; } } ret = tsk_ibd_finder_record_ibd(self, parent); if (ret != 0) { goto out; } ret = tsk_ibd_finder_add_queued_ancestry(self, parent); if (ret != 0) { goto out; } } out: return ret; } static int tsk_ibd_finder_free(tsk_ibd_finder_t *self) { tsk_blkalloc_free(&self->segment_heap); tsk_safe_free(self->sample_set_id); 
tsk_safe_free(self->ancestor_map_head);
    tsk_safe_free(self->ancestor_map_tail);
    tsk_safe_free(self->segment_queue);
    return 0;
}

/*************************
 * simplifier
 *************************/

/* Consistency checks on the simplifier's internal state. Every violation
 * trips a tsk_bug_assert. */
static void
simplifier_check_state(simplifier_t *self)
{
    tsk_size_t j, k;
    tsk_segment_t *u;
    mutation_id_list_t *list_node;
    tsk_id_t site;
    interval_list_t *int_list;
    tsk_id_t child;
    double position, last_position;
    bool found;
    tsk_size_t num_intervals;

    /* Ancestry chains: head and tail are both set or both NULL; segments are
     * non-empty, ordered and non-overlapping; two segments that touch must
     * map to different nodes; the last segment is the recorded tail. */
    for (j = 0; j < self->input_tables.nodes.num_rows; j++) {
        tsk_bug_assert((self->ancestor_map_head[j] == NULL)
                       == (self->ancestor_map_tail[j] == NULL));
        for (u = self->ancestor_map_head[j]; u != NULL; u = u->next) {
            tsk_bug_assert(u->left < u->right);
            if (u->next != NULL) {
                tsk_bug_assert(u->right <= u->next->left);
                if (u->right == u->next->left) {
                    tsk_bug_assert(u->node != u->next->node);
                }
            } else {
                tsk_bug_assert(u == self->ancestor_map_tail[j]);
            }
        }
    }

    /* Queued segments must be non-empty intervals. */
    for (j = 0; j < self->segment_queue_size; j++) {
        tsk_bug_assert(self->segment_queue[j].left < self->segment_queue[j].right);
    }

    /* Per-node mutation lists: each mutation belongs to the node it is filed
     * under, and site positions are non-decreasing along the list. */
    for (j = 0; j < self->input_tables.nodes.num_rows; j++) {
        last_position = -1;
        for (list_node = self->node_mutation_list_map_head[j]; list_node != NULL;
             list_node = list_node->next) {
            tsk_bug_assert(
                self->input_tables.mutations.node[list_node->mutation] == (tsk_id_t) j);
            site = self->input_tables.mutations.site[list_node->mutation];
            position = self->input_tables.sites.position[site];
            tsk_bug_assert(last_position <= position);
            last_position = position;
        }
    }

    /* check the buffered edges */
    for (j = 0; j < self->input_tables.nodes.num_rows; j++) {
        tsk_bug_assert((self->child_edge_map_head[j] == NULL)
                       == (self->child_edge_map_tail[j] == NULL));
        if (self->child_edge_map_head[j] != NULL) {
            /* Make sure that the child is in our list */
            found = false;
            for (k = 0; k < self->num_buffered_children; k++) {
                if (self->buffered_children[k] == (tsk_id_t) j) {
                    found = true;
                    break;
                }
            }
            tsk_bug_assert(found);
        }
    }
    /* Count the buffered intervals over all buffered children. */
    num_intervals = 0;
    for (j = 0; j <
self->num_buffered_children; j++) { child = self->buffered_children[j]; tsk_bug_assert(self->child_edge_map_head[child] != NULL); for (int_list = self->child_edge_map_head[child]; int_list != NULL; int_list = int_list->next) { tsk_bug_assert(int_list->left < int_list->right); if (int_list->next != NULL) { tsk_bug_assert(int_list->right < int_list->next->left); } num_intervals++; } } tsk_bug_assert( num_intervals == self->interval_list_heap.total_allocated / (sizeof(interval_list_t))); } static void print_segment_chain(tsk_segment_t *head, FILE *out) { tsk_segment_t *u; for (u = head; u != NULL; u = u->next) { fprintf(out, "(%f,%f->%lld)", u->left, u->right, (long long) u->node); } } static void simplifier_print_state(simplifier_t *self, FILE *out) { tsk_size_t j; tsk_segment_t *u; mutation_id_list_t *list_node; interval_list_t *int_list; tsk_id_t child; fprintf(out, "--simplifier state--\n"); fprintf(out, "options:\n"); fprintf(out, "\tfilter_unreferenced_sites : %d\n", !!(self->options & TSK_SIMPLIFY_FILTER_SITES)); fprintf(out, "\tno_filter_nodes : %d\n", !!(self->options & TSK_SIMPLIFY_NO_FILTER_NODES)); fprintf(out, "\treduce_to_site_topology : %d\n", !!(self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY)); fprintf(out, "\tkeep_unary : %d\n", !!(self->options & TSK_SIMPLIFY_KEEP_UNARY)); fprintf(out, "\tkeep_input_roots : %d\n", !!(self->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS)); fprintf(out, "\tkeep_unary_in_individuals : %d\n", !!(self->options & TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS)); fprintf(out, "===\nInput tables\n==\n"); tsk_table_collection_print_state(&self->input_tables, out); fprintf(out, "===\nOutput tables\n==\n"); tsk_table_collection_print_state(self->tables, out); fprintf(out, "===\nmemory heaps\n==\n"); fprintf(out, "segment_heap:\n"); tsk_blkalloc_print_state(&self->segment_heap, out); fprintf(out, "interval_list_heap:\n"); tsk_blkalloc_print_state(&self->interval_list_heap, out); fprintf(out, "===\nancestors\n==\n"); for (j = 0; j < 
self->input_tables.nodes.num_rows; j++) {
        fprintf(out, "%lld:\t", (long long) j);
        print_segment_chain(self->ancestor_map_head[j], out);
        fprintf(out, "\n");
    }
    /* Only input nodes that have been mapped to an output node are listed. */
    fprintf(out, "===\nnode_id map (input->output)\n==\n");
    for (j = 0; j < self->input_tables.nodes.num_rows; j++) {
        if (self->node_id_map[j] != TSK_NULL) {
            fprintf(
                out, "%lld->%lld\n", (long long) j, (long long) self->node_id_map[j]);
        }
    }
    fprintf(out, "===\nsegment queue\n==\n");
    for (j = 0; j < self->segment_queue_size; j++) {
        u = &self->segment_queue[j];
        fprintf(out, "(%f,%f->%lld)", u->left, u->right, (long long) u->node);
        fprintf(out, "\n");
    }

    fprintf(out, "===\nbuffered children\n==\n");
    for (j = 0; j < self->num_buffered_children; j++) {
        child = self->buffered_children[j];
        /* NOTE(review): the label printed here is the index j into
         * buffered_children, not the child node ID -- confirm this is the
         * intended output. */
        fprintf(out, "%lld -> ", (long long) j);
        for (int_list = self->child_edge_map_head[child]; int_list != NULL;
             int_list = int_list->next) {
            fprintf(out, "(%f, %f), ", int_list->left, int_list->right);
        }
        fprintf(out, "\n");
    }
    fprintf(out, "===\nmutation node map\n==\n");
    for (j = 0; j < self->input_tables.mutations.num_rows; j++) {
        fprintf(out, "%lld\t-> %lld\n", (long long) j,
            (long long) self->mutation_node_map[j]);
    }
    /* Only nodes that have at least one mutation are listed. */
    fprintf(out, "===\nnode mutation id list map\n==\n");
    for (j = 0; j < self->input_tables.nodes.num_rows; j++) {
        if (self->node_mutation_list_map_head[j] != NULL) {
            fprintf(out, "%lld\t-> [", (long long) j);
            for (list_node = self->node_mutation_list_map_head[j]; list_node != NULL;
                 list_node = list_node->next) {
                fprintf(out, "%lld,", (long long) list_node->mutation);
            }
            fprintf(out, "]\n");
        }
    }
    /* The lookup has one entry per site plus one at each end. */
    if (!!(self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY)) {
        fprintf(out, "===\nposition_lookup\n==\n");
        for (j = 0; j < self->input_tables.sites.num_rows + 2; j++) {
            fprintf(out, "%lld\t-> %f\n", (long long) j, self->position_lookup[j]);
        }
    }
    /* Finish by running the internal consistency checks. */
    simplifier_check_state(self);
}

/* Allocate and initialise a segment from the simplifier's segment heap.
 * Returns NULL if the allocation fails. */
static tsk_segment_t *TSK_WARN_UNUSED
simplifier_alloc_segment(simplifier_t *self, double left, double right, tsk_id_t node)
{
    tsk_segment_t *seg = NULL;

    seg =
tsk_blkalloc_get(&self->segment_heap, sizeof(*seg)); if (seg == NULL) { goto out; } seg->next = NULL; seg->left = left; seg->right = right; seg->node = node; out: return seg; } static interval_list_t *TSK_WARN_UNUSED simplifier_alloc_interval_list(simplifier_t *self, double left, double right) { interval_list_t *x = NULL; x = tsk_blkalloc_get(&self->interval_list_heap, sizeof(*x)); if (x == NULL) { goto out; } x->next = NULL; x->left = left; x->right = right; out: return x; } /* Add a new node to the output node table corresponding to the specified input id. * Returns the new ID. */ static tsk_id_t TSK_WARN_UNUSED simplifier_record_node(simplifier_t *self, tsk_id_t input_id) { tsk_node_t node; bool update_flags = !(self->options & TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS); tsk_node_table_get_row_unsafe(&self->input_tables.nodes, (tsk_id_t) input_id, &node); if (update_flags) { /* Zero out the sample bit */ node.flags &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE; if (self->is_sample[input_id]) { node.flags |= TSK_NODE_IS_SAMPLE; } } self->node_id_map[input_id] = (tsk_id_t) self->tables->nodes.num_rows; return tsk_node_table_add_row(&self->tables->nodes, node.flags, node.time, node.population, node.individual, node.metadata, node.metadata_length); } /* Remove the mapping for the last recorded node. 
*/ static int simplifier_rewind_node(simplifier_t *self, tsk_id_t input_id, tsk_id_t output_id) { self->node_id_map[input_id] = TSK_NULL; return tsk_node_table_truncate(&self->tables->nodes, (tsk_size_t) output_id); } static int simplifier_flush_edges(simplifier_t *self, tsk_id_t parent, tsk_size_t *ret_num_edges) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_id_t child; interval_list_t *x; tsk_size_t num_edges = 0; qsort(self->buffered_children, (size_t) self->num_buffered_children, sizeof(tsk_id_t), cmp_node_id); for (j = 0; j < self->num_buffered_children; j++) { child = self->buffered_children[j]; for (x = self->child_edge_map_head[child]; x != NULL; x = x->next) { ret_id = tsk_edge_table_add_row( &self->tables->edges, x->left, x->right, parent, child, NULL, 0); if (ret_id < 0) { ret = (int) ret_id; goto out; } num_edges++; } self->child_edge_map_head[child] = NULL; self->child_edge_map_tail[child] = NULL; } self->num_buffered_children = 0; *ret_num_edges = num_edges; ret = tsk_blkalloc_reset(&self->interval_list_heap); out: return ret; } /* When we are reducing topology down to what is visible at the sites we need a * lookup table to find the closest site position for each edge. We do this with * a sorted array and binary search */ static int simplifier_init_position_lookup(simplifier_t *self) { int ret = 0; tsk_size_t num_sites = self->input_tables.sites.num_rows; self->position_lookup = tsk_malloc((num_sites + 2) * sizeof(*self->position_lookup)); if (self->position_lookup == NULL) { goto out; } self->position_lookup[0] = 0; self->position_lookup[num_sites + 1] = self->input_tables.sequence_length; tsk_memcpy(self->position_lookup + 1, self->input_tables.sites.position, num_sites * sizeof(double)); out: return ret; } /* * Find the smallest site position index greater than or equal to left * and right, i.e., slide each endpoint of an interval to the right * until they hit a site position. 
If both left and right map to the * the same position then we discard this edge. We also discard an * edge if left = 0 and right is less than the first site position. */ static bool simplifier_map_reduced_coordinates(simplifier_t *self, double *left, double *right) { double *X = self->position_lookup; tsk_size_t N = self->input_tables.sites.num_rows + 2; tsk_size_t left_index, right_index; bool skip = false; left_index = tsk_search_sorted(X, N, *left); right_index = tsk_search_sorted(X, N, *right); if (left_index == right_index || (left_index == 0 && right_index == 1)) { skip = true; } else { /* Remap back to zero if the left end maps to the first site. */ if (left_index == 1) { left_index = 0; } *left = X[left_index]; *right = X[right_index]; } return skip; } /* Records the specified edge for the current parent by buffering it */ static int simplifier_record_edge(simplifier_t *self, double left, double right, tsk_id_t child) { int ret = 0; interval_list_t *tail, *x; bool skip; if (self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY) { skip = simplifier_map_reduced_coordinates(self, &left, &right); /* NOTE: we exit early here when reduce_coordindates has told us to * skip this edge, as it is not visible in the reduced tree sequence */ if (skip) { goto out; } } tail = self->child_edge_map_tail[child]; if (tail == NULL) { tsk_bug_assert(self->num_buffered_children < self->input_tables.nodes.num_rows); self->buffered_children[self->num_buffered_children] = child; self->num_buffered_children++; x = simplifier_alloc_interval_list(self, left, right); if (x == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->child_edge_map_head[child] = x; self->child_edge_map_tail[child] = x; } else { if (tail->right == left) { tail->right = right; } else { x = simplifier_alloc_interval_list(self, left, right); if (x == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tail->next = x; self->child_edge_map_tail[child] = x; } } out: return ret; } static int 
simplifier_init_sites(simplifier_t *self) { int ret = 0; tsk_id_t node; mutation_id_list_t *list_node; tsk_size_t j; self->mutation_node_map = tsk_calloc(self->input_tables.mutations.num_rows, sizeof(tsk_id_t)); self->node_mutation_list_mem = tsk_malloc(self->input_tables.mutations.num_rows * sizeof(mutation_id_list_t)); self->node_mutation_list_map_head = tsk_calloc(self->input_tables.nodes.num_rows, sizeof(mutation_id_list_t *)); self->node_mutation_list_map_tail = tsk_calloc(self->input_tables.nodes.num_rows, sizeof(mutation_id_list_t *)); if (self->mutation_node_map == NULL || self->node_mutation_list_mem == NULL || self->node_mutation_list_map_head == NULL || self->node_mutation_list_map_tail == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(self->mutation_node_map, 0xff, self->input_tables.mutations.num_rows * sizeof(tsk_id_t)); for (j = 0; j < self->input_tables.mutations.num_rows; j++) { node = self->input_tables.mutations.node[j]; list_node = self->node_mutation_list_mem + j; list_node->mutation = (tsk_id_t) j; list_node->next = NULL; if (self->node_mutation_list_map_head[node] == NULL) { self->node_mutation_list_map_head[node] = list_node; } else { self->node_mutation_list_map_tail[node]->next = list_node; } self->node_mutation_list_map_tail[node] = list_node; } out: return ret; } static void simplifier_map_mutations( simplifier_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) { mutation_id_list_t *m_node; double position; tsk_id_t site; m_node = self->node_mutation_list_map_head[input_id]; while (m_node != NULL) { site = self->input_tables.mutations.site[m_node->mutation]; position = self->input_tables.sites.position[site]; if (left <= position && position < right) { self->mutation_node_map[m_node->mutation] = output_id; } m_node = m_node->next; } } static int TSK_WARN_UNUSED simplifier_add_ancestry( simplifier_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) { int ret = 0; 
tsk_segment_t *tail = self->ancestor_map_tail[input_id];
    tsk_segment_t *x;

    tsk_bug_assert(left < right);
    if (tail == NULL) {
        /* First segment for this node: it becomes both head and tail. */
        x = simplifier_alloc_segment(self, left, right, output_id);
        if (x == NULL) {
            ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
            goto out;
        }
        self->ancestor_map_head[input_id] = x;
        self->ancestor_map_tail[input_id] = x;
    } else {
        if (tail->right == left && tail->node == output_id) {
            /* Contiguous with the tail and mapped to the same output node:
             * extend the tail instead of allocating a new segment. */
            tail->right = right;
        } else {
            x = simplifier_alloc_segment(self, left, right, output_id);
            if (x == NULL) {
                ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
                goto out;
            }
            tail->next = x;
            self->ancestor_map_tail[input_id] = x;
        }
    }
    simplifier_map_mutations(self, input_id, left, right, output_id);
out:
    return ret;
}

/* Sets up the internal working copies of the various tables, as needed
 * depending on the specified options. */
static int
simplifier_init_tables(simplifier_t *self)
{
    int ret;
    bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES);
    bool filter_populations = self->options & TSK_SIMPLIFY_FILTER_POPULATIONS;
    bool filter_individuals = self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS;
    bool filter_sites = self->options & TSK_SIMPLIFY_FILTER_SITES;
    tsk_bookmark_t rows_to_retain;

    /* NOTE: this is a bit inefficient here as we're taking copies of
     * the tables even in the no-filter case where the original tables
     * won't be touched (beyond references to external tables that may
     * need updating). Future versions may do something a bit more
     * complicated like temporarily stealing the pointers to the
     * underlying column memory in these tables, and then being careful
     * not to free the table at the end.
*/
    ret = tsk_table_collection_copy(self->tables, &self->input_tables, 0);
    if (ret != 0) {
        goto out;
    }
    /* Start from "retain nothing" and then keep all rows of the provenance
     * table and of every table that is not being filtered. Tables left at
     * zero are emptied by the truncate below. */
    memset(&rows_to_retain, 0, sizeof(rows_to_retain));
    rows_to_retain.provenances = self->tables->provenances.num_rows;
    if (!filter_nodes) {
        rows_to_retain.nodes = self->tables->nodes.num_rows;
    }
    if (!filter_populations) {
        rows_to_retain.populations = self->tables->populations.num_rows;
    }
    if (!filter_individuals) {
        rows_to_retain.individuals = self->tables->individuals.num_rows;
    }
    if (!filter_sites) {
        rows_to_retain.sites = self->tables->sites.num_rows;
    }

    ret = tsk_table_collection_truncate(self->tables, &rows_to_retain);
    if (ret != 0) {
        goto out;
    }
out:
    return ret;
}

/* Set up the output node table and the input->output node ID map, then add
 * the initial ancestry for each sample. */
static int
simplifier_init_nodes(simplifier_t *self, const tsk_id_t *samples)
{
    int ret = 0;
    tsk_id_t node_id;
    tsk_size_t j;
    const tsk_size_t num_nodes = self->input_tables.nodes.num_rows;
    bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES);
    bool update_flags = !(self->options & TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS);
    tsk_flags_t *node_flags = self->tables->nodes.flags;
    tsk_id_t *node_id_map = self->node_id_map;

    if (filter_nodes) {
        tsk_bug_assert(self->tables->nodes.num_rows == 0);
        /* The node table has been cleared. Add nodes for the samples.
*/
        for (j = 0; j < self->num_samples; j++) {
            node_id = simplifier_record_node(self, samples[j]);
            if (node_id < 0) {
                ret = (int) node_id;
                goto out;
            }
        }
    } else {
        /* Nodes are not filtered: the output node table still has every
         * input row, so the ID map is the identity. */
        tsk_bug_assert(self->tables->nodes.num_rows == num_nodes);
        if (update_flags) {
            for (j = 0; j < num_nodes; j++) {
                /* Reset the sample flags */
                node_flags[j] &= (tsk_flags_t) ~TSK_NODE_IS_SAMPLE;
                if (self->is_sample[j]) {
                    node_flags[j] |= TSK_NODE_IS_SAMPLE;
                }
            }
        }

        for (j = 0; j < num_nodes; j++) {
            node_id_map[j] = (tsk_id_t) j;
        }
    }
    /* Add the initial ancestry */
    for (j = 0; j < self->num_samples; j++) {
        node_id = samples[j];
        ret = simplifier_add_ancestry(self, node_id, 0,
            self->input_tables.sequence_length, self->node_id_map[node_id]);
        if (ret != 0) {
            goto out;
        }
    }
out:
    return ret;
}

/* Initialise the simplifier for the given samples, output tables and
 * options. The struct is zeroed before anything else is done. */
static int
simplifier_init(simplifier_t *self, const tsk_id_t *samples, tsk_size_t num_samples,
    tsk_table_collection_t *tables, tsk_flags_t options)
{
    int ret = 0;
    tsk_size_t j;
    tsk_id_t ret_id;
    tsk_size_t num_nodes;

    tsk_memset(self, 0, sizeof(simplifier_t));
    self->num_samples = num_samples;
    self->options = options;
    self->tables = tables;

    /* TODO we can add a flag to skip these checks for when we know they are
     * unnecessary */
    /* TODO Current unit tests require TSK_CHECK_SITE_DUPLICATES but it's
     * debatable whether we need it. If we remove, we definitely need explicit
     * tests to ensure we're doing sensible things with duplicate sites.
     * (Particularly, re TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY.)
*/
    ret_id = tsk_table_collection_check_integrity(tables,
        TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING | TSK_CHECK_SITE_DUPLICATES);
    if (ret_id != 0) {
        ret = (int) ret_id;
        goto out;
    }

    /* Allocate the heaps used for small objects. Assuming 8K is a good chunk size */
    ret = tsk_blkalloc_init(&self->segment_heap, 8192);
    if (ret != 0) {
        goto out;
    }
    ret = tsk_blkalloc_init(&self->interval_list_heap, 8192);
    if (ret != 0) {
        goto out;
    }
    ret = segment_overlapper_alloc(&self->segment_overlapper);
    if (ret != 0) {
        goto out;
    }

    num_nodes = tables->nodes.num_rows;
    /* Make the maps and set the initial state */
    self->ancestor_map_head = tsk_calloc(num_nodes, sizeof(tsk_segment_t *));
    self->ancestor_map_tail = tsk_calloc(num_nodes, sizeof(tsk_segment_t *));
    self->child_edge_map_head = tsk_calloc(num_nodes, sizeof(interval_list_t *));
    self->child_edge_map_tail = tsk_calloc(num_nodes, sizeof(interval_list_t *));
    self->node_id_map = tsk_malloc(num_nodes * sizeof(tsk_id_t));
    self->buffered_children = tsk_malloc(num_nodes * sizeof(tsk_id_t));
    self->is_sample = tsk_calloc(num_nodes, sizeof(bool));
    self->max_segment_queue_size = 64;
    self->segment_queue
        = tsk_malloc(self->max_segment_queue_size * sizeof(tsk_segment_t));
    /* All allocations are checked together. */
    if (self->ancestor_map_head == NULL || self->ancestor_map_tail == NULL
        || self->child_edge_map_head == NULL || self->child_edge_map_tail == NULL
        || self->node_id_map == NULL || self->is_sample == NULL
        || self->segment_queue == NULL || self->buffered_children == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }

    /* Go through the samples to check for errors before we clear the tables.
*/ for (j = 0; j < self->num_samples; j++) { if (samples[j] < 0 || samples[j] >= (tsk_id_t) num_nodes) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } if (self->is_sample[samples[j]]) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } self->is_sample[samples[j]] = true; } tsk_memset(self->node_id_map, 0xff, num_nodes * sizeof(tsk_id_t)); ret = simplifier_init_tables(self); if (ret != 0) { goto out; } ret = simplifier_init_sites(self); if (ret != 0) { goto out; } ret = simplifier_init_nodes(self, samples); if (ret != 0) { goto out; } if (self->options & TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY) { ret = simplifier_init_position_lookup(self); if (ret != 0) { goto out; } } self->edge_sort_offset = TSK_NULL; out: return ret; } static int simplifier_free(simplifier_t *self) { tsk_table_collection_free(&self->input_tables); tsk_blkalloc_free(&self->segment_heap); tsk_blkalloc_free(&self->interval_list_heap); segment_overlapper_free(&self->segment_overlapper); tsk_safe_free(self->ancestor_map_head); tsk_safe_free(self->ancestor_map_tail); tsk_safe_free(self->child_edge_map_head); tsk_safe_free(self->child_edge_map_tail); tsk_safe_free(self->node_id_map); tsk_safe_free(self->segment_queue); tsk_safe_free(self->is_sample); tsk_safe_free(self->mutation_node_map); tsk_safe_free(self->node_mutation_list_mem); tsk_safe_free(self->node_mutation_list_map_head); tsk_safe_free(self->node_mutation_list_map_tail); tsk_safe_free(self->buffered_children); tsk_safe_free(self->position_lookup); return 0; } static int TSK_WARN_UNUSED simplifier_enqueue_segment(simplifier_t *self, double left, double right, tsk_id_t node) { int ret = 0; tsk_segment_t *seg; void *p; tsk_bug_assert(left < right); /* Make sure we always have room for one more segment in the queue so we * can put a tail sentinel on it */ if (self->segment_queue_size == self->max_segment_queue_size - 1) { self->max_segment_queue_size *= 2; p = tsk_realloc(self->segment_queue, self->max_segment_queue_size 
* sizeof(*self->segment_queue)); if (p == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } self->segment_queue = p; } seg = self->segment_queue + self->segment_queue_size; seg->left = left; seg->right = right; seg->node = node; self->segment_queue_size++; out: return ret; } static int TSK_WARN_UNUSED simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id) { int ret = 0; tsk_segment_t **X, *x; tsk_size_t j, num_overlapping, num_flushed_edges; double left, right, prev_right; tsk_id_t ancestry_node; tsk_id_t output_id = self->node_id_map[input_id]; bool is_sample = self->is_sample[input_id]; bool filter_nodes = !(self->options & TSK_SIMPLIFY_NO_FILTER_NODES); bool keep_unary = self->options & TSK_SIMPLIFY_KEEP_UNARY; if ((self->options & TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS) && (self->input_tables.nodes.individual[input_id] != TSK_NULL)) { keep_unary = true; } if (is_sample) { /* Free up the existing ancestry mapping. */ x = self->ancestor_map_tail[input_id]; tsk_bug_assert(x->left == 0 && x->right == self->tables->sequence_length); self->ancestor_map_head[input_id] = NULL; self->ancestor_map_tail[input_id] = NULL; } ret = segment_overlapper_start( &self->segment_overlapper, self->segment_queue, self->segment_queue_size); if (ret != 0) { goto out; } prev_right = 0; while ((ret = segment_overlapper_next( &self->segment_overlapper, &left, &right, &X, &num_overlapping)) == 1) { tsk_bug_assert(left < right); tsk_bug_assert(num_overlapping > 0); if (num_overlapping == 1) { ancestry_node = X[0]->node; if (is_sample) { ret = simplifier_record_edge(self, left, right, ancestry_node); if (ret != 0) { goto out; } ancestry_node = output_id; } else if (keep_unary) { if (output_id == TSK_NULL) { output_id = simplifier_record_node(self, input_id); } ret = simplifier_record_edge(self, left, right, ancestry_node); if (ret != 0) { goto out; } } } else { if (output_id == TSK_NULL) { output_id = simplifier_record_node(self, input_id); if (output_id < 0) { ret = 
(int) output_id; goto out; } } ancestry_node = output_id; for (j = 0; j < num_overlapping; j++) { ret = simplifier_record_edge(self, left, right, X[j]->node); if (ret != 0) { goto out; } } } if (is_sample && left != prev_right) { /* Fill in any gaps in ancestry for the sample */ ret = simplifier_add_ancestry(self, input_id, prev_right, left, output_id); if (ret != 0) { goto out; } } if (keep_unary) { ancestry_node = output_id; } ret = simplifier_add_ancestry(self, input_id, left, right, ancestry_node); if (ret != 0) { goto out; } prev_right = right; } /* Check for errors occuring in the loop condition */ if (ret != 0) { goto out; } if (is_sample && prev_right != self->tables->sequence_length) { /* If a trailing gap exists in the sample ancestry, fill it in. */ ret = simplifier_add_ancestry( self, input_id, prev_right, self->tables->sequence_length, output_id); if (ret != 0) { goto out; } } if (output_id != TSK_NULL) { ret = simplifier_flush_edges(self, output_id, &num_flushed_edges); if (ret != 0) { goto out; } if (filter_nodes && (num_flushed_edges == 0) && !is_sample) { ret = simplifier_rewind_node(self, input_id, output_id); } } out: return ret; } /* Extract the ancestry for the specified input node over the specified * interval and queue it up for merging. 
*/ static int TSK_WARN_UNUSED simplifier_extract_ancestry( simplifier_t *self, double left, double right, tsk_id_t input_id) { int ret = 0; tsk_segment_t *x = self->ancestor_map_head[input_id]; tsk_segment_t y; /* y is the segment that has been removed */ tsk_segment_t *x_head, *x_prev, *seg_left, *seg_right; x_head = NULL; x_prev = NULL; while (x != NULL) { if (x->right > left && right > x->left) { y.left = TSK_MAX(x->left, left); y.right = TSK_MIN(x->right, right); y.node = x->node; ret = simplifier_enqueue_segment(self, y.left, y.right, y.node); if (ret != 0) { goto out; } seg_left = NULL; seg_right = NULL; if (x->left != y.left) { seg_left = simplifier_alloc_segment(self, x->left, y.left, x->node); if (seg_left == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (x_prev == NULL) { x_head = seg_left; } else { x_prev->next = seg_left; } x_prev = seg_left; } if (x->right != y.right) { x->left = y.right; seg_right = x; } else { seg_right = x->next; // TODO free x } if (x_prev == NULL) { x_head = seg_right; } else { x_prev->next = seg_right; } x = seg_right; } else { if (x_prev == NULL) { x_head = x; } x_prev = x; x = x->next; } } self->ancestor_map_head[input_id] = x_head; self->ancestor_map_tail[input_id] = x_prev; out: return ret; } static int TSK_WARN_UNUSED simplifier_process_parent_edges( simplifier_t *self, tsk_id_t parent, tsk_size_t start, tsk_size_t end) { int ret = 0; tsk_size_t j; const tsk_edge_table_t *input_edges = &self->input_tables.edges; tsk_id_t child; double left, right; /* Go through the edges and queue up ancestry segments for processing. 
*/ self->segment_queue_size = 0; for (j = start; j < end; j++) { tsk_bug_assert(parent == input_edges->parent[j]); child = input_edges->child[j]; left = input_edges->left[j]; right = input_edges->right[j]; ret = simplifier_extract_ancestry(self, left, right, child); if (ret != 0) { goto out; } } /* We can now merge the ancestral segments for the parent */ ret = simplifier_merge_ancestors(self, parent); if (ret != 0) { goto out; } out: return ret; } static int TSK_WARN_UNUSED simplifier_finalise_site_references( simplifier_t *self, const bool *site_referenced, tsk_id_t *site_id_map) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_site_t site; const tsk_size_t num_sites = self->input_tables.sites.num_rows; if (self->options & TSK_SIMPLIFY_FILTER_SITES) { for (j = 0; j < num_sites; j++) { tsk_site_table_get_row_unsafe( &self->input_tables.sites, (tsk_id_t) j, &site); site_id_map[j] = TSK_NULL; if (site_referenced[j]) { ret_id = tsk_site_table_add_row(&self->tables->sites, site.position, site.ancestral_state, site.ancestral_state_length, site.metadata, site.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } site_id_map[j] = ret_id; } } } else { tsk_bug_assert(self->tables->sites.num_rows == num_sites); for (j = 0; j < num_sites; j++) { site_id_map[j] = (tsk_id_t) j; } } out: return ret; } static int TSK_WARN_UNUSED simplifier_finalise_population_references(simplifier_t *self) { int ret = 0; tsk_size_t j; tsk_id_t pop_id, ret_id; tsk_population_t pop; tsk_id_t *node_population = self->tables->nodes.population; const tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_size_t num_populations = self->input_tables.populations.num_rows; bool *population_referenced = tsk_calloc(num_populations, sizeof(*population_referenced)); tsk_id_t *population_id_map = tsk_malloc(num_populations * sizeof(*population_id_map)); tsk_bug_assert(self->options & TSK_SIMPLIFY_FILTER_POPULATIONS); if (population_referenced == NULL || population_id_map == NULL) { ret 
= tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < num_nodes; j++) { pop_id = node_population[j]; if (pop_id != TSK_NULL) { population_referenced[pop_id] = true; } } for (j = 0; j < num_populations; j++) { tsk_population_table_get_row_unsafe( &self->input_tables.populations, (tsk_id_t) j, &pop); population_id_map[j] = TSK_NULL; if (population_referenced[j]) { ret_id = tsk_population_table_add_row( &self->tables->populations, pop.metadata, pop.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } population_id_map[j] = ret_id; } } /* Remap the IDs in the node table */ for (j = 0; j < num_nodes; j++) { pop_id = node_population[j]; if (pop_id != TSK_NULL) { node_population[j] = population_id_map[pop_id]; } } out: tsk_safe_free(population_id_map); tsk_safe_free(population_referenced); return ret; } static int TSK_WARN_UNUSED simplifier_finalise_individual_references(simplifier_t *self) { int ret = 0; tsk_size_t j; tsk_id_t pop_id, ret_id; tsk_individual_t ind; tsk_id_t *node_individual = self->tables->nodes.individual; tsk_id_t *parents; const tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_size_t num_individuals = self->input_tables.individuals.num_rows; bool *individual_referenced = tsk_calloc(num_individuals, sizeof(*individual_referenced)); tsk_id_t *individual_id_map = tsk_malloc(num_individuals * sizeof(*individual_id_map)); tsk_bug_assert(self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS); if (individual_referenced == NULL || individual_id_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < num_nodes; j++) { pop_id = node_individual[j]; if (pop_id != TSK_NULL) { individual_referenced[pop_id] = true; } } for (j = 0; j < num_individuals; j++) { tsk_individual_table_get_row_unsafe( &self->input_tables.individuals, (tsk_id_t) j, &ind); individual_id_map[j] = TSK_NULL; if (individual_referenced[j]) { /* Can't remap the parents inline here because we have no * guarantees about sortedness */ 
ret_id = tsk_individual_table_add_row(&self->tables->individuals, ind.flags, ind.location, ind.location_length, ind.parents, ind.parents_length, ind.metadata, ind.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } individual_id_map[j] = ret_id; } } /* Remap the IDs in the node table */ for (j = 0; j < num_nodes; j++) { pop_id = node_individual[j]; if (pop_id != TSK_NULL) { node_individual[j] = individual_id_map[pop_id]; } } /* Remap parent IDs. * * NOTE! must take the pointer reference here as it can change from * the start of the function */ parents = self->tables->individuals.parents; for (j = 0; j < self->tables->individuals.parents_length; j++) { if (parents[j] != TSK_NULL) { parents[j] = individual_id_map[parents[j]]; } } out: tsk_safe_free(individual_id_map); tsk_safe_free(individual_referenced); return ret; } static int TSK_WARN_UNUSED simplifier_output_sites(simplifier_t *self) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; tsk_mutation_t mutation; const tsk_size_t num_sites = self->input_tables.sites.num_rows; const tsk_size_t num_mutations = self->input_tables.mutations.num_rows; bool *site_referenced = tsk_calloc(num_sites, sizeof(*site_referenced)); tsk_id_t *site_id_map = tsk_malloc(num_sites * sizeof(*site_id_map)); tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); const tsk_id_t *mutation_node_map = self->mutation_node_map; const tsk_id_t *mutation_site = self->input_tables.mutations.site; if (site_referenced == NULL || site_id_map == NULL || mutation_id_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < num_mutations; j++) { if (mutation_node_map[j] != TSK_NULL) { site_referenced[mutation_site[j]] = true; } } ret = simplifier_finalise_site_references(self, site_referenced, site_id_map); if (ret != 0) { goto out; } for (j = 0; j < num_mutations; j++) { mutation_id_map[j] = TSK_NULL; if (mutation_node_map[j] != TSK_NULL) { tsk_mutation_table_get_row_unsafe( 
&self->input_tables.mutations, (tsk_id_t) j, &mutation); mutation.node = mutation_node_map[j]; mutation.site = site_id_map[mutation.site]; if (mutation.parent != TSK_NULL) { mutation.parent = mutation_id_map[mutation.parent]; } ret_id = tsk_mutation_table_add_row(&self->tables->mutations, mutation.site, mutation.node, mutation.parent, mutation.time, mutation.derived_state, mutation.derived_state_length, mutation.metadata, mutation.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } mutation_id_map[j] = ret_id; } } out: tsk_safe_free(site_referenced); tsk_safe_free(site_id_map); tsk_safe_free(mutation_id_map); return ret; } /* Flush the remaining non-edge and node data in the model to the * output tables. */ static int TSK_WARN_UNUSED simplifier_flush_output(simplifier_t *self) { int ret = 0; /* TODO Migrations fit reasonably neatly into the pattern that we have here. We * can consider references to populations from migration objects in the same way * as from nodes, so that we only remove a population if its referenced by * neither. Mapping the population IDs in migrations is then easy. In principle * nodes are similar, but the semantics are slightly different because we've * already allocated all the nodes by their references from edges. 
We then * need to decide whether we remove migrations that reference unmapped nodes * or whether to add these nodes back in (probably the former is the correct * approach).*/ if (self->input_tables.migrations.num_rows != 0) { ret = tsk_trace_error(TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED); goto out; } ret = simplifier_output_sites(self); if (ret != 0) { goto out; } if (self->options & TSK_SIMPLIFY_FILTER_POPULATIONS) { ret = simplifier_finalise_population_references(self); if (ret != 0) { goto out; } } if (self->options & TSK_SIMPLIFY_FILTER_INDIVIDUALS) { ret = simplifier_finalise_individual_references(self); if (ret != 0) { goto out; } } out: return ret; } static void simplifier_set_edge_sort_offset(simplifier_t *self, double youngest_root_time) { const tsk_edge_table_t edges = self->tables->edges; const double *node_time = self->tables->nodes.time; int64_t offset; for (offset = 0; offset < (int64_t) edges.num_rows; offset++) { if (node_time[edges.parent[offset]] >= youngest_root_time) { break; } } self->edge_sort_offset = offset; } static int TSK_WARN_UNUSED simplifier_sort_edges(simplifier_t *self) { /* designated initialisers are guaranteed to set any missing fields to * zero, so we don't need to set the rest of them. 
*/ tsk_bookmark_t bookmark = { .edges = (tsk_size_t) self->edge_sort_offset, .sites = self->tables->sites.num_rows, .mutations = self->tables->mutations.num_rows, }; tsk_bug_assert(self->edge_sort_offset >= 0); return tsk_table_collection_sort(self->tables, &bookmark, 0); } static int TSK_WARN_UNUSED simplifier_insert_input_roots(simplifier_t *self) { int ret = 0; tsk_id_t input_id, output_id; tsk_segment_t *x; tsk_size_t num_flushed_edges; double youngest_root_time = DBL_MAX; const double *node_time = self->tables->nodes.time; for (input_id = 0; input_id < (tsk_id_t) self->input_tables.nodes.num_rows; input_id++) { x = self->ancestor_map_head[input_id]; if (x != NULL) { output_id = self->node_id_map[input_id]; if (output_id == TSK_NULL) { output_id = simplifier_record_node(self, input_id); if (output_id < 0) { ret = (int) output_id; goto out; } } youngest_root_time = TSK_MIN(youngest_root_time, node_time[output_id]); while (x != NULL) { if (x->node != output_id) { ret = simplifier_record_edge(self, x->left, x->right, x->node); if (ret != 0) { goto out; } simplifier_map_mutations( self, input_id, x->left, x->right, output_id); } x = x->next; } ret = simplifier_flush_edges(self, output_id, &num_flushed_edges); if (ret != 0) { goto out; } } } if (youngest_root_time != DBL_MAX) { simplifier_set_edge_sort_offset(self, youngest_root_time); } out: return ret; } static int TSK_WARN_UNUSED simplifier_run(simplifier_t *self, tsk_id_t *node_map) { int ret = 0; tsk_size_t j, start; tsk_id_t parent, current_parent; const tsk_edge_table_t *input_edges = &self->input_tables.edges; tsk_size_t num_edges = input_edges->num_rows; if (num_edges > 0) { start = 0; current_parent = input_edges->parent[0]; for (j = 0; j < num_edges; j++) { parent = input_edges->parent[j]; if (parent != current_parent) { ret = simplifier_process_parent_edges(self, current_parent, start, j); if (ret != 0) { goto out; } current_parent = parent; start = j; } } ret = simplifier_process_parent_edges(self, 
current_parent, start, num_edges); if (ret != 0) { goto out; } } if (self->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS) { ret = simplifier_insert_input_roots(self); if (ret != 0) { goto out; } } ret = simplifier_flush_output(self); if (ret != 0) { goto out; } if (node_map != NULL) { /* Finally, output the new IDs for the nodes, if required. */ tsk_memcpy(node_map, self->node_id_map, self->input_tables.nodes.num_rows * sizeof(tsk_id_t)); } if (self->edge_sort_offset != TSK_NULL) { tsk_bug_assert(self->options & TSK_SIMPLIFY_KEEP_INPUT_ROOTS); ret = simplifier_sort_edges(self); if (ret != 0) { goto out; } } out: return ret; } /************************* * table_collection *************************/ typedef struct { tsk_id_t index; /* These are the sort keys in order */ double first; double second; tsk_id_t third; tsk_id_t fourth; } index_sort_t; static int cmp_index_sort(const void *a, const void *b) { const index_sort_t *ca = (const index_sort_t *) a; const index_sort_t *cb = (const index_sort_t *) b; int ret = (ca->first > cb->first) - (ca->first < cb->first); if (ret == 0) { ret = (ca->second > cb->second) - (ca->second < cb->second); if (ret == 0) { ret = (ca->third > cb->third) - (ca->third < cb->third); if (ret == 0) { ret = (ca->fourth > cb->fourth) - (ca->fourth < cb->fourth); } } } return ret; } static int tsk_table_collection_check_offsets(const tsk_table_collection_t *self) { int ret = 0; ret = check_offsets(self->nodes.num_rows, self->nodes.metadata_offset, self->nodes.metadata_length, true); if (ret != 0) { goto out; } ret = check_offsets(self->sites.num_rows, self->sites.ancestral_state_offset, self->sites.ancestral_state_length, true); if (ret != 0) { goto out; } ret = check_offsets(self->sites.num_rows, self->sites.metadata_offset, self->sites.metadata_length, true); if (ret != 0) { goto out; } ret = check_offsets(self->mutations.num_rows, self->mutations.derived_state_offset, self->mutations.derived_state_length, true); if (ret != 0) { goto out; } ret = 
check_offsets(self->mutations.num_rows, self->mutations.metadata_offset,
        self->mutations.metadata_length, true);
    if (ret != 0) {
        goto out;
    }
    ret = check_offsets(self->individuals.num_rows, self->individuals.metadata_offset,
        self->individuals.metadata_length, true);
    if (ret != 0) {
        goto out;
    }
    ret = check_offsets(self->provenances.num_rows, self->provenances.timestamp_offset,
        self->provenances.timestamp_length, true);
    if (ret != 0) {
        goto out;
    }
    ret = check_offsets(self->provenances.num_rows, self->provenances.record_offset,
        self->provenances.record_length, true);
    if (ret != 0) {
        goto out;
    }
    ret = 0;
out:
    return ret;
}

/* Check the node table: every time must be finite, every individual
 * reference must be TSK_NULL or a valid individual ID, and (unless
 * TSK_NO_CHECK_POPULATION_REFS is set in `options`) every population
 * reference must be TSK_NULL or a valid population ID.
 *
 * Returns 0 if all rows pass, or the error for the first failing check.
 */
static int
tsk_table_collection_check_node_integrity(
    const tsk_table_collection_t *self, tsk_flags_t options)
{
    int ret = 0;
    tsk_size_t j;
    double node_time;
    tsk_id_t population, individual;
    tsk_id_t num_populations = (tsk_id_t) self->populations.num_rows;
    tsk_id_t num_individuals = (tsk_id_t) self->individuals.num_rows;
    const bool check_population_refs = !(options & TSK_NO_CHECK_POPULATION_REFS);

    for (j = 0; j < self->nodes.num_rows; j++) {
        node_time = self->nodes.time[j];
        if (!tsk_isfinite(node_time)) {
            ret = tsk_trace_error(TSK_ERR_TIME_NONFINITE);
            goto out;
        }
        if (check_population_refs) {
            population = self->nodes.population[j];
            /* TSK_NULL itself is allowed; anything below it is not. */
            if (population < TSK_NULL || population >= num_populations) {
                ret = tsk_trace_error(TSK_ERR_POPULATION_OUT_OF_BOUNDS);
                goto out;
            }
        }
        individual = self->nodes.individual[j];
        if (individual < TSK_NULL || individual >= num_individuals) {
            ret = tsk_trace_error(TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS);
            goto out;
        }
    }
out:
    return ret;
}

/* Check the edge table.
 *
 * For every edge: parent and child must be valid, non-null node IDs; left and
 * right must be finite with 0 <= left < right <= sequence_length; and the
 * child must be strictly younger than the parent. If TSK_CHECK_EDGE_ORDERING
 * is set in `options`, the edges must additionally be sorted by
 * (time[parent], parent, child, left), contain no duplicates, and list all
 * the edges for a given parent contiguously.
 *
 * Returns 0 if all rows pass, or the error for the first failing check. The
 * checks on each row run in the order written, so that order determines which
 * error is reported when a row has several problems.
 */
static int
tsk_table_collection_check_edge_integrity(
    const tsk_table_collection_t *self, tsk_flags_t options)
{
    int ret = 0;
    tsk_size_t j;
    tsk_id_t parent, last_parent, child, last_child;
    double left, last_left, right;
    const double *time = self->nodes.time;
    const double L = self->sequence_length;
    const tsk_edge_table_t edges = self->edges;
    const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows;
    const bool check_ordering = !!(options & TSK_CHECK_EDGE_ORDERING);
    /* parent_seen[u] is set once the run of edges with parent u has ended;
     * only allocated when ordering is being checked. */
    bool *parent_seen = NULL;

    if (check_ordering) {
        parent_seen = tsk_calloc((tsk_size_t) num_nodes, sizeof(*parent_seen));
        if (parent_seen == NULL) {
            ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
            goto out;
        }
    }

    /* Just keeping compiler happy; these values don't matter. */
    last_left = 0;
    last_parent = 0;
    last_child = 0;
    for (j = 0; j < edges.num_rows; j++) {
        parent = edges.parent[j];
        child = edges.child[j];
        left = edges.left[j];
        right = edges.right[j];
        /* Node ID integrity */
        if (parent == TSK_NULL) {
            ret = tsk_trace_error(TSK_ERR_NULL_PARENT);
            goto out;
        }
        if (parent < 0 || parent >= num_nodes) {
            ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
            goto out;
        }
        if (child == TSK_NULL) {
            ret = tsk_trace_error(TSK_ERR_NULL_CHILD);
            goto out;
        }
        if (child < 0 || child >= num_nodes) {
            ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
            goto out;
        }
        /* Spatial requirements for edges */
        if (!(tsk_isfinite(left) && tsk_isfinite(right))) {
            ret = tsk_trace_error(TSK_ERR_GENOME_COORDS_NONFINITE);
            goto out;
        }
        if (left < 0) {
            ret = tsk_trace_error(TSK_ERR_LEFT_LESS_ZERO);
            goto out;
        }
        if (right > L) {
            ret = tsk_trace_error(TSK_ERR_RIGHT_GREATER_SEQ_LENGTH);
            goto out;
        }
        if (left >= right) {
            ret = tsk_trace_error(TSK_ERR_BAD_EDGE_INTERVAL);
            goto out;
        }
        /* time[child] must be < time[parent] */
        if (time[child] >= time[parent]) {
            ret = tsk_trace_error(TSK_ERR_BAD_NODE_TIME_ORDERING);
            goto out;
        }

        if (check_ordering) {
            /* This parent's run of edges was already closed off earlier. */
            if (parent_seen[parent]) {
                ret = tsk_trace_error(TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS);
                goto out;
            }
            if (j > 0) {
                /* Input data must be sorted by (time[parent], parent, child, left).
                 */
                if (time[parent] < time[last_parent]) {
                    ret = tsk_trace_error(TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME);
                    goto out;
                }
                if (time[parent] == time[last_parent]) {
                    if (parent == last_parent) {
                        if (child < last_child) {
                            ret = tsk_trace_error(TSK_ERR_EDGES_NOT_SORTED_CHILD);
                            goto out;
                        }
                        if (child == last_child) {
                            if (left == last_left) {
                                ret = tsk_trace_error(TSK_ERR_DUPLICATE_EDGES);
                                goto out;
                            } else if (left < last_left) {
                                ret = tsk_trace_error(TSK_ERR_EDGES_NOT_SORTED_LEFT);
                                goto out;
                            }
                        }
                    } else {
                        /* Different parent with the same time: the previous
                         * parent's run has ended. */
                        parent_seen[last_parent] = true;
                    }
                }
            }
            last_parent = parent;
            last_child = child;
            last_left = left;
        }
    }
out:
    tsk_safe_free(parent_seen);
    return ret;
}

/* Check the site table: every position must be finite and lie in
 * [0, sequence_length). If TSK_CHECK_SITE_DUPLICATES is set in `options`,
 * adjacent rows must not share a position; if TSK_CHECK_SITE_ORDERING is set,
 * positions must be non-decreasing.
 *
 * Returns 0 if all rows pass, or the error for the first failing check.
 */
static int TSK_WARN_UNUSED
tsk_table_collection_check_site_integrity(
    const tsk_table_collection_t *self, tsk_flags_t options)
{
    int ret = 0;
    tsk_size_t j;
    double position;
    const double L = self->sequence_length;
    const tsk_site_table_t sites = self->sites;
    const bool check_site_ordering = !!(options & TSK_CHECK_SITE_ORDERING);
    const bool check_site_duplicates = !!(options & TSK_CHECK_SITE_DUPLICATES);

    for (j = 0; j < sites.num_rows; j++) {
        position = sites.position[j];
        /* Spatial requirements */
        if (!tsk_isfinite(position)) {
            ret = tsk_trace_error(TSK_ERR_BAD_SITE_POSITION);
            goto out;
        }
        if (position < 0 || position >= L) {
            ret = tsk_trace_error(TSK_ERR_BAD_SITE_POSITION);
            goto out;
        }
        if (j > 0) {
            /* Only the immediately preceding row is compared. */
            if (check_site_duplicates && sites.position[j - 1] == position) {
                ret = tsk_trace_error(TSK_ERR_DUPLICATE_SITE_POSITION);
                goto out;
            }
            if (check_site_ordering && sites.position[j - 1] > position) {
                ret = tsk_trace_error(TSK_ERR_UNSORTED_SITES);
                goto out;
            }
        }
    }
out:
    return ret;
}

static int TSK_WARN_UNUSED
tsk_table_collection_check_mutation_integrity(
    const tsk_table_collection_t *self, tsk_flags_t options)
{
    int ret = 0;
    tsk_size_t j;
    tsk_id_t parent_mut;
    double mutation_time;
    double last_known_time = INFINITY;
    const tsk_mutation_table_t mutations = self->mutations;
    const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows;
    const
tsk_id_t num_sites = (tsk_id_t) self->sites.num_rows;
    const tsk_id_t num_mutations = (tsk_id_t) self->mutations.num_rows;
    const double *node_time = self->nodes.time;
    const bool check_mutation_ordering = !!(options & TSK_CHECK_MUTATION_ORDERING);
    bool unknown_time;
    /* Counts of known/unknown mutation times seen so far at the current site */
    int num_known_times = 0;
    int num_unknown_times = 0;

    for (j = 0; j < mutations.num_rows; j++) {
        /* Basic reference integrity */
        if (mutations.site[j] < 0 || mutations.site[j] >= num_sites) {
            ret = tsk_trace_error(TSK_ERR_SITE_OUT_OF_BOUNDS);
            goto out;
        }
        if (mutations.node[j] < 0 || mutations.node[j] >= num_nodes) {
            ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
            goto out;
        }
        /* Integrity check for mutation parent */
        parent_mut = mutations.parent[j];
        if (parent_mut < TSK_NULL || parent_mut >= num_mutations) {
            ret = tsk_trace_error(TSK_ERR_MUTATION_OUT_OF_BOUNDS);
            goto out;
        }
        if (parent_mut == (tsk_id_t) j) {
            ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_EQUAL);
            goto out;
        }
        /* Check that time is finite and not more recent than node time */
        mutation_time = mutations.time[j];
        unknown_time = tsk_is_unknown_time(mutation_time);
        if (!unknown_time) {
            if (!tsk_isfinite(mutation_time)) {
                ret = tsk_trace_error(TSK_ERR_TIME_NONFINITE);
                goto out;
            }
            if (mutation_time < node_time[mutations.node[j]]) {
                ret = tsk_trace_error(TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE);
                goto out;
            }
        }

        /* reset checks when reaching a new site */
        if (j > 0 && mutations.site[j - 1] != mutations.site[j]) {
            last_known_time = INFINITY;
            num_known_times = 0;
            num_unknown_times = 0;
        }

        /* Check known/unknown times are not both present on a site */
        if (unknown_time) {
            num_unknown_times++;
        } else {
            num_known_times++;
        }
        if ((num_unknown_times > 0) && (num_known_times > 0)) {
            ret = tsk_trace_error(TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN);
            goto out;
        }

        /* check parent site agrees */
        if (parent_mut != TSK_NULL) {
            if (mutations.site[parent_mut] != mutations.site[j]) {
                ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE);
                goto out;
            }
            /* If this mutation time is known, then the parent time
             * must also be, or else the
             * TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN check
             * above will fail. */
            if (!unknown_time && mutation_time > mutations.time[parent_mut]) {
                ret = tsk_trace_error(TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION);
                goto out;
            }
        }

        if (check_mutation_ordering) {
            /* Check site ordering */
            if (j > 0 && mutations.site[j - 1] > mutations.site[j]) {
                ret = tsk_trace_error(TSK_ERR_UNSORTED_MUTATIONS);
                goto out;
            }

            /* Check if parents are listed before their children */
            if (parent_mut != TSK_NULL && parent_mut > (tsk_id_t) j) {
                ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_AFTER_CHILD);
                goto out;
            }

            /* Check time ordering. We do this after the other checks above,
             * so that more specific errors trigger first */
            if (!unknown_time) {
                if (mutation_time > last_known_time) {
                    ret = tsk_trace_error(TSK_ERR_UNSORTED_MUTATIONS);
                    goto out;
                }
                last_known_time = mutation_time;
            }
        }
    }
out:
    return ret;
}

/* Check the migration table.
 *
 * For every migration: the node must be a valid node ID; unless
 * TSK_NO_CHECK_POPULATION_REFS is set in `options`, source and dest must be
 * valid population IDs; the time must be finite; and left/right must be
 * finite with 0 <= left < right <= sequence_length. If
 * TSK_CHECK_MIGRATION_ORDERING is set, times must be non-decreasing.
 *
 * Returns 0 if all rows pass, or the error for the first failing check.
 */
static int TSK_WARN_UNUSED
tsk_table_collection_check_migration_integrity(
    const tsk_table_collection_t *self, tsk_flags_t options)
{
    int ret = 0;
    tsk_size_t j;
    double left, right, time;
    const double L = self->sequence_length;
    const tsk_migration_table_t migrations = self->migrations;
    const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows;
    const tsk_id_t num_populations = (tsk_id_t) self->populations.num_rows;
    const bool check_population_refs = !(options & TSK_NO_CHECK_POPULATION_REFS);
    const bool check_migration_ordering = !!(options & TSK_CHECK_MIGRATION_ORDERING);

    for (j = 0; j < migrations.num_rows; j++) {
        if (migrations.node[j] < 0 || migrations.node[j] >= num_nodes) {
            ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
            goto out;
        }
        if (check_population_refs) {
            /* Unlike nodes, TSK_NULL is not accepted for source or dest. */
            if (migrations.source[j] < 0 || migrations.source[j] >= num_populations) {
                ret = tsk_trace_error(TSK_ERR_POPULATION_OUT_OF_BOUNDS);
                goto out;
            }
            if (migrations.dest[j] < 0 || migrations.dest[j] >= num_populations) {
                ret = tsk_trace_error(TSK_ERR_POPULATION_OUT_OF_BOUNDS);
                goto out;
            }
        }
        time = migrations.time[j];
        if (!tsk_isfinite(time)) {
            ret = tsk_trace_error(TSK_ERR_TIME_NONFINITE);
            goto out;
        }
        if (j > 0) {
            if (check_migration_ordering && migrations.time[j - 1] > time) {
                ret = tsk_trace_error(TSK_ERR_UNSORTED_MIGRATIONS);
                goto out;
            }
        }
        left = migrations.left[j];
        right = migrations.right[j];
        /* Spatial requirements */
        /* TODO it's a bit misleading to use the edge-specific errors here. */
        if (!(tsk_isfinite(left) && tsk_isfinite(right))) {
            ret = tsk_trace_error(TSK_ERR_GENOME_COORDS_NONFINITE);
            goto out;
        }
        if (left < 0) {
            ret = tsk_trace_error(TSK_ERR_LEFT_LESS_ZERO);
            goto out;
        }
        if (right > L) {
            ret = tsk_trace_error(TSK_ERR_RIGHT_GREATER_SEQ_LENGTH);
            goto out;
        }
        if (left >= right) {
            ret = tsk_trace_error(TSK_ERR_BAD_EDGE_INTERVAL);
            goto out;
        }
    }
out:
    return ret;
}

/* Check the individual table: every entry in an individual's parents list
 * must be TSK_NULL or a valid individual ID, and must not be the individual
 * itself. If TSK_CHECK_INDIVIDUAL_ORDERING is set in `options`, every
 * non-null parent must also have a smaller ID than the individual.
 *
 * Returns 0 if all rows pass, or the error for the first failing check.
 */
static int TSK_WARN_UNUSED
tsk_table_collection_check_individual_integrity(
    const tsk_table_collection_t *self, tsk_flags_t options)
{
    int ret = 0;
    tsk_size_t j, k;
    const tsk_individual_table_t individuals = self->individuals;
    const tsk_id_t num_individuals = (tsk_id_t) individuals.num_rows;
    const bool check_individual_ordering = options & TSK_CHECK_INDIVIDUAL_ORDERING;

    for (j = 0; j < (tsk_size_t) num_individuals; j++) {
        /* parents_offset[j] .. parents_offset[j + 1] delimits row j's parents */
        for (k = individuals.parents_offset[j]; k < individuals.parents_offset[j + 1];
             k++) {
            /* Check parent references are valid */
            if (individuals.parents[k] != TSK_NULL
                && (individuals.parents[k] < 0
                       || individuals.parents[k] >= num_individuals)) {
                ret = tsk_trace_error(TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS);
                goto out;
            }
            /* Check no-one is their own parent */
            if (individuals.parents[k] == (tsk_id_t) j) {
                ret = tsk_trace_error(TSK_ERR_INDIVIDUAL_SELF_PARENT);
                goto out;
            }
            /* Check parents are ordered */
            if (check_individual_ordering && individuals.parents[k] != TSK_NULL
                && individuals.parents[k] >= (tsk_id_t) j) {
                ret = tsk_trace_error(TSK_ERR_UNSORTED_INDIVIDUALS);
                goto out;
            }
        }
    }
out:
    return ret;
}

static
tsk_id_t TSK_WARN_UNUSED
tsk_table_collection_check_tree_integrity(const tsk_table_collection_t *self)
/* Sweep left to right along the sequence using the edge insertion and removal
 * indexes, maintaining the parent of each node in the current tree, and check
 * that:
 *  - the indexes are consistent with the edges (each edge is inserted exactly
 *    once and removed exactly once, and every tree has positive span);
 *  - no node has two parents in the same tree;
 *  - no mutation with a known time is as old as, or older than, the parent of
 *    its node in the tree covering its site.
 *
 * Returns the number of trees on success, or a (negative) error code. Asserts
 * that the indexes are present and cover all edges.
 */
{
    tsk_id_t ret = 0;
    tsk_size_t j, k;
    tsk_id_t e, u, site, mutation;
    double tree_left, tree_right;
    const double sequence_length = self->sequence_length;
    const tsk_id_t num_sites = (tsk_id_t) self->sites.num_rows;
    const tsk_id_t num_mutations = (tsk_id_t) self->mutations.num_rows;
    const tsk_size_t num_edges = self->edges.num_rows;
    const double *restrict site_position = self->sites.position;
    const tsk_id_t *restrict mutation_site = self->mutations.site;
    const tsk_id_t *restrict mutation_node = self->mutations.node;
    const double *restrict mutation_time = self->mutations.time;
    const double *restrict node_time = self->nodes.time;
    const tsk_id_t *restrict I = self->indexes.edge_insertion_order;
    const tsk_id_t *restrict O = self->indexes.edge_removal_order;
    const double *restrict edge_right = self->edges.right;
    const double *restrict edge_left = self->edges.left;
    const tsk_id_t *restrict edge_child = self->edges.child;
    const tsk_id_t *restrict edge_parent = self->edges.parent;
    /* parent[u] is the parent of node u in the current tree, or TSK_NULL */
    tsk_id_t *restrict parent = NULL;
    /* used_edges[e] is 0 before edge e is inserted, 1 while it is in the
     * tree, and 2 after it has been removed */
    int8_t *restrict used_edges = NULL;
    tsk_id_t num_trees = 0;

    parent = tsk_malloc(self->nodes.num_rows * sizeof(*parent));
    used_edges = tsk_malloc(num_edges * sizeof(*used_edges));
    if (parent == NULL || used_edges == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }
    /* All bytes 0xff sets every entry to -1, i.e. TSK_NULL */
    tsk_memset(parent, 0xff, self->nodes.num_rows * sizeof(*parent));
    tsk_memset(used_edges, 0, num_edges * sizeof(*used_edges));

    tree_left = 0;
    num_trees = 0;
    j = 0;
    k = 0;
    site = 0;
    mutation = 0;
    tsk_bug_assert(I != NULL && O != NULL);
    tsk_bug_assert(self->indexes.num_edges == num_edges);

    while (j < num_edges || tree_left < sequence_length) {
        /* Remove the edges that end at the left of this tree */
        while (k < num_edges && edge_right[O[k]] == tree_left) {
            e = O[k];
            if (used_edges[e] != 1) {
                ret = tsk_trace_error(TSK_ERR_TABLES_BAD_INDEXES);
                goto out;
            }
            parent[edge_child[e]] = TSK_NULL;
            used_edges[e]++;
            k++;
        }
        /* Insert the edges that start at the left of this tree */
        while (j < num_edges && edge_left[I[j]] == tree_left) {
            e = I[j];
            if (used_edges[e] != 0) {
                ret = tsk_trace_error(TSK_ERR_TABLES_BAD_INDEXES);
                goto out;
            }
            used_edges[e]++;
            u = edge_child[e];
            if (parent[u] != TSK_NULL) {
                ret = tsk_trace_error(TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN);
                goto out;
            }
            parent[u] = edge_parent[e];
            j++;
        }
        /* The tree ends at the next edge insertion or removal point */
        tree_right = sequence_length;
        if (j < num_edges) {
            tree_right = TSK_MIN(tree_right, edge_left[I[j]]);
        }
        if (k < num_edges) {
            tree_right = TSK_MIN(tree_right, edge_right[O[k]]);
        }
        /* Check the mutations at every site that falls within this tree */
        while (site < num_sites && site_position[site] < tree_right) {
            while (mutation < num_mutations && mutation_site[mutation] == site) {
                if (!tsk_is_unknown_time(mutation_time[mutation])
                    && parent[mutation_node[mutation]] != TSK_NULL
                    && node_time[parent[mutation_node[mutation]]]
                           <= mutation_time[mutation]) {
                    ret = tsk_trace_error(TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE);
                    goto out;
                }
                mutation++;
            }
            site++;
        }
        /* A tree with no span means the sweep is not making progress */
        if (tree_right <= tree_left) {
            ret = tsk_trace_error(TSK_ERR_TABLES_BAD_INDEXES);
            goto out;
        }
        tree_left = tree_right;
        /* This is technically possible; if we have 2**31 edges each defining
         * a single tree, and there's a gap between each of these edges we
         * would overflow this counter. */
        if (num_trees == TSK_MAX_ID) {
            ret = tsk_trace_error(TSK_ERR_TREE_OVERFLOW);
            goto out;
        }
        num_trees++;
    }
    tsk_bug_assert(j == num_edges);
    while (k < num_edges) {
        /* At this point it must be that used_edges[O[k]] == 1,
         * since otherwise we would have added a different edge twice,
         * and so hit the error above. */
        e = O[k];
        if (edge_right[e] != sequence_length) {
            ret = tsk_trace_error(TSK_ERR_TABLES_BAD_INDEXES);
            goto out;
        }
        used_edges[e]++;
        k++;
    }
    ret = num_trees;

out:
    /* Can't use tsk_safe_free because of restrict*/
    if (parent != NULL) {
        free(parent);
    }
    if (used_edges != NULL) {
        free(used_edges);
    }
    return ret;
}

/* Check that the table collection is indexed and that every entry in the
 * edge insertion and removal indexes is a valid edge ID.
 *
 * Returns 0 on success, TSK_ERR_TABLES_NOT_INDEXED if there is no index, or
 * TSK_ERR_EDGE_OUT_OF_BOUNDS for an invalid entry.
 */
static int TSK_WARN_UNUSED
tsk_table_collection_check_index_integrity(const tsk_table_collection_t *self)
{
    int ret = 0;
    tsk_id_t j;
    const tsk_id_t num_edges = (tsk_id_t) self->edges.num_rows;
    const tsk_id_t *edge_insertion_order = self->indexes.edge_insertion_order;
    const tsk_id_t *edge_removal_order = self->indexes.edge_removal_order;

    if (!tsk_table_collection_has_index(self, 0)) {
        ret = tsk_trace_error(TSK_ERR_TABLES_NOT_INDEXED);
        goto out;
    }
    for (j = 0; j < num_edges; j++) {
        if (edge_insertion_order[j] < 0 || edge_insertion_order[j] >= num_edges) {
            ret = tsk_trace_error(TSK_ERR_EDGE_OUT_OF_BOUNDS);
            goto out;
        }
        if (edge_removal_order[j] < 0 || edge_removal_order[j] >= num_edges) {
            ret = tsk_trace_error(TSK_ERR_EDGE_OUT_OF_BOUNDS);
            goto out;
        }
    }
out:
    return ret;
}

static int TSK_WARN_UNUSED
tsk_table_collection_compute_mutation_parents_to_array(
    const tsk_table_collection_t *self, tsk_id_t *mutation_parent)
{
    int ret = 0;
    const tsk_id_t *I, *O;
    const tsk_edge_table_t edges = self->edges;
    const tsk_node_table_t nodes = self->nodes;
    const tsk_site_table_t sites = self->sites;
    const tsk_mutation_table_t mutations = self->mutations;
    const tsk_id_t M = (tsk_id_t) edges.num_rows;
    tsk_id_t tj, tk;
    tsk_id_t *parent = NULL;
    tsk_id_t *bottom_mutation = NULL;
    tsk_id_t u;
    double left, right;
    tsk_id_t site;
    /* Using unsigned values here avoids potentially undefined behaviour */
    tsk_size_t j, mutation, first_mutation;

    parent = tsk_malloc(nodes.num_rows * sizeof(*parent));
    bottom_mutation = tsk_malloc(nodes.num_rows * sizeof(*bottom_mutation));
    if (parent == NULL || bottom_mutation == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }
    tsk_memset(parent, 0xff,
nodes.num_rows * sizeof(*parent)); tsk_memset(bottom_mutation, 0xff, nodes.num_rows * sizeof(*bottom_mutation)); tsk_memset(mutation_parent, 0xff, self->mutations.num_rows * sizeof(tsk_id_t)); I = self->indexes.edge_insertion_order; O = self->indexes.edge_removal_order; tj = 0; tk = 0; site = 0; mutation = 0; left = 0; while (tj < M || left < self->sequence_length) { while (tk < M && edges.right[O[tk]] == left) { parent[edges.child[O[tk]]] = TSK_NULL; tk++; } while (tj < M && edges.left[I[tj]] == left) { parent[edges.child[I[tj]]] = edges.parent[I[tj]]; tj++; } right = self->sequence_length; if (tj < M) { right = TSK_MIN(right, edges.left[I[tj]]); } if (tk < M) { right = TSK_MIN(right, edges.right[O[tk]]); } /* Tree is now ready. We look at each site on this tree in turn */ while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { /* Create a mapping from mutations to nodes. If we see more than one * mutation at a node, the previously seen one must be the parent * of the current since we assume they are in order. */ first_mutation = mutation; while (mutation < mutations.num_rows && mutations.site[mutation] == site) { u = mutations.node[mutation]; if (bottom_mutation[u] != TSK_NULL) { mutation_parent[mutation] = bottom_mutation[u]; } bottom_mutation[u] = (tsk_id_t) mutation; mutation++; } /* Make the common case of 1 mutation fast */ if (mutation > first_mutation + 1) { /* If we have more than one mutation, compute the parent for each * one by traversing up the tree until we find a node that has a * mutation. 
*/ for (j = first_mutation; j < mutation; j++) { if (mutation_parent[j] == TSK_NULL) { u = parent[mutations.node[j]]; while (u != TSK_NULL && bottom_mutation[u] == TSK_NULL) { u = parent[u]; } if (u != TSK_NULL) { mutation_parent[j] = bottom_mutation[u]; } } } } /* Reset the mapping for the next site */ for (j = first_mutation; j < mutation; j++) { u = mutations.node[j]; bottom_mutation[u] = TSK_NULL; /* Check that we haven't violated the sortedness property */ if (mutation_parent[j] > (tsk_id_t) j) { ret = tsk_trace_error(TSK_ERR_MUTATION_PARENT_AFTER_CHILD); goto out; } } site++; } /* Move on to the next tree */ left = right; } out: tsk_safe_free(parent); tsk_safe_free(bottom_mutation); return ret; } static int TSK_WARN_UNUSED tsk_table_collection_check_mutation_parents(const tsk_table_collection_t *self) { int ret = 0; tsk_mutation_table_t mutations = self->mutations; tsk_id_t *new_parents = NULL; tsk_size_t j; if (mutations.num_rows == 0) { return ret; } new_parents = tsk_malloc(mutations.num_rows * sizeof(*new_parents)); if (new_parents == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_table_collection_compute_mutation_parents_to_array(self, new_parents); if (ret != 0) { goto out; } for (j = 0; j < mutations.num_rows; j++) { if (mutations.parent[j] != new_parents[j]) { ret = tsk_trace_error(TSK_ERR_BAD_MUTATION_PARENT); goto out; } } out: tsk_safe_free(new_parents); return ret; } tsk_id_t TSK_WARN_UNUSED tsk_table_collection_check_integrity( const tsk_table_collection_t *self, tsk_flags_t options) { tsk_id_t ret = 0; int mut_ret = 0; if (options & TSK_CHECK_MUTATION_PARENTS) { /* If we're checking mutation parents, we need to check the trees first */ options |= TSK_CHECK_TREES; } if (options & TSK_CHECK_TREES) { /* Checking the trees implies these checks */ options |= TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING | TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES; } if 
(!tsk_isfinite(self->sequence_length) || self->sequence_length <= 0) { ret = tsk_trace_error(TSK_ERR_BAD_SEQUENCE_LENGTH); goto out; } ret = tsk_table_collection_check_offsets(self); if (ret != 0) { goto out; } ret = tsk_table_collection_check_node_integrity(self, options); if (ret != 0) { goto out; } ret = tsk_table_collection_check_edge_integrity(self, options); if (ret != 0) { goto out; } ret = tsk_table_collection_check_site_integrity(self, options); if (ret != 0) { goto out; } ret = tsk_table_collection_check_mutation_integrity(self, options); if (ret != 0) { goto out; } ret = tsk_table_collection_check_migration_integrity(self, options); if (ret != 0) { goto out; } ret = tsk_table_collection_check_individual_integrity(self, options); if (ret != 0) { goto out; } if (options & TSK_CHECK_INDEXES) { ret = tsk_table_collection_check_index_integrity(self); if (ret != 0) { goto out; } } if (options & TSK_CHECK_TREES) { ret = tsk_table_collection_check_tree_integrity(self); if (ret < 0) { goto out; } /* This check requires tree integrity so do it last */ if (options & TSK_CHECK_MUTATION_PARENTS) { mut_ret = tsk_table_collection_check_mutation_parents(self); if (mut_ret != 0) { ret = mut_ret; goto out; } } } out: return ret; } void tsk_table_collection_print_state(const tsk_table_collection_t *self, FILE *out) { fprintf(out, "Table collection state\n"); fprintf(out, "sequence_length = %f\n", self->sequence_length); write_metadata_schema_header( out, self->metadata_schema, self->metadata_schema_length); fprintf(out, "#metadata#\n"); fprintf(out, "%.*s\n", (int) self->metadata_length, self->metadata); fprintf(out, "#end#metadata\n"); fprintf(out, "#time_units#\n"); fprintf(out, "%.*s\n", (int) self->time_units_length, self->time_units); fprintf(out, "#end#time_units\n"); tsk_individual_table_print_state(&self->individuals, out); tsk_node_table_print_state(&self->nodes, out); tsk_edge_table_print_state(&self->edges, out); 
tsk_migration_table_print_state(&self->migrations, out); tsk_site_table_print_state(&self->sites, out); tsk_mutation_table_print_state(&self->mutations, out); tsk_population_table_print_state(&self->populations, out); tsk_provenance_table_print_state(&self->provenances, out); } int TSK_WARN_UNUSED tsk_table_collection_init(tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; tsk_flags_t edge_options = 0; tsk_memset(self, 0, sizeof(*self)); if (options & TSK_TC_NO_EDGE_METADATA) { edge_options |= TSK_TABLE_NO_METADATA; } /* Set default time_units value */ ret = tsk_table_collection_set_time_units( self, TSK_TIME_UNITS_UNKNOWN, strlen(TSK_TIME_UNITS_UNKNOWN)); if (ret != 0) { goto out; } ret = tsk_node_table_init(&self->nodes, 0); if (ret != 0) { goto out; } ret = tsk_edge_table_init(&self->edges, edge_options); if (ret != 0) { goto out; } ret = tsk_migration_table_init(&self->migrations, 0); if (ret != 0) { goto out; } ret = tsk_site_table_init(&self->sites, 0); if (ret != 0) { goto out; } ret = tsk_mutation_table_init(&self->mutations, 0); if (ret != 0) { goto out; } ret = tsk_individual_table_init(&self->individuals, 0); if (ret != 0) { goto out; } ret = tsk_population_table_init(&self->populations, 0); if (ret != 0) { goto out; } ret = tsk_provenance_table_init(&self->provenances, 0); if (ret != 0) { goto out; } ret = tsk_reference_sequence_init(&self->reference_sequence, 0); if (ret != 0) { goto out; } out: return ret; } int tsk_table_collection_free(tsk_table_collection_t *self) { tsk_individual_table_free(&self->individuals); tsk_node_table_free(&self->nodes); tsk_edge_table_free(&self->edges); tsk_migration_table_free(&self->migrations); tsk_site_table_free(&self->sites); tsk_mutation_table_free(&self->mutations); tsk_population_table_free(&self->populations); tsk_provenance_table_free(&self->provenances); tsk_reference_sequence_free(&self->reference_sequence); tsk_safe_free(self->indexes.edge_insertion_order); 
tsk_safe_free(self->indexes.edge_removal_order); tsk_safe_free(self->file_uuid); tsk_safe_free(self->time_units); tsk_safe_free(self->metadata); tsk_safe_free(self->metadata_schema); return 0; } bool tsk_table_collection_equals(const tsk_table_collection_t *self, const tsk_table_collection_t *other, tsk_flags_t options) { bool ret = self->sequence_length == other->sequence_length && self->time_units_length == other->time_units_length && tsk_memcmp(self->time_units, other->time_units, self->time_units_length * sizeof(char)) == 0; if (!(options & TSK_CMP_IGNORE_TABLES)) { ret = ret && tsk_individual_table_equals( &self->individuals, &other->individuals, options) && tsk_node_table_equals(&self->nodes, &other->nodes, options) && tsk_edge_table_equals(&self->edges, &other->edges, options) && tsk_migration_table_equals( &self->migrations, &other->migrations, options) && tsk_site_table_equals(&self->sites, &other->sites, options) && tsk_mutation_table_equals(&self->mutations, &other->mutations, options) && tsk_population_table_equals( &self->populations, &other->populations, options); /* TSK_CMP_IGNORE_TABLES implies TSK_CMP_IGNORE_PROVENANCE */ if (!(options & TSK_CMP_IGNORE_PROVENANCE)) { ret = ret && tsk_provenance_table_equals( &self->provenances, &other->provenances, options); } } /* TSK_CMP_IGNORE_TS_METADATA is implied by TSK_CMP_IGNORE_METADATA */ if (options & TSK_CMP_IGNORE_METADATA) { options |= TSK_CMP_IGNORE_TS_METADATA; } if (!(options & TSK_CMP_IGNORE_TS_METADATA)) { ret = ret && (self->metadata_length == other->metadata_length && self->metadata_schema_length == other->metadata_schema_length && tsk_memcmp(self->metadata, other->metadata, self->metadata_length * sizeof(char)) == 0 && tsk_memcmp(self->metadata_schema, other->metadata_schema, self->metadata_schema_length * sizeof(char)) == 0); } if (!(options & TSK_CMP_IGNORE_REFERENCE_SEQUENCE)) { ret = ret && tsk_reference_sequence_equals( &self->reference_sequence, &other->reference_sequence, options); } 
return ret; } int tsk_table_collection_set_time_units( tsk_table_collection_t *self, const char *time_units, tsk_size_t time_units_length) { return replace_string( &self->time_units, &self->time_units_length, time_units, time_units_length); } int tsk_table_collection_set_metadata( tsk_table_collection_t *self, const char *metadata, tsk_size_t metadata_length) { return replace_string( &self->metadata, &self->metadata_length, metadata, metadata_length); } int tsk_table_collection_takeset_metadata( tsk_table_collection_t *self, char *metadata, tsk_size_t metadata_length) { return takeset_string( &self->metadata, &self->metadata_length, metadata, metadata_length); } int tsk_table_collection_set_metadata_schema(tsk_table_collection_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length) { return replace_string(&self->metadata_schema, &self->metadata_schema_length, metadata_schema, metadata_schema_length); } int tsk_table_collection_set_indexes(tsk_table_collection_t *self, tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order) { int ret = 0; tsk_size_t index_size = self->edges.num_rows * sizeof(tsk_id_t); tsk_table_collection_drop_index(self, 0); self->indexes.edge_insertion_order = tsk_malloc(index_size); self->indexes.edge_removal_order = tsk_malloc(index_size); if (self->indexes.edge_insertion_order == NULL || self->indexes.edge_removal_order == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy(self->indexes.edge_insertion_order, edge_insertion_order, index_size); tsk_memcpy(self->indexes.edge_removal_order, edge_removal_order, index_size); self->indexes.num_edges = self->edges.num_rows; out: return ret; } int tsk_table_collection_takeset_indexes(tsk_table_collection_t *self, tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order) { int ret = 0; if (edge_insertion_order == NULL || edge_removal_order == NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } tsk_table_collection_drop_index(self, 0); 
self->indexes.edge_insertion_order = edge_insertion_order; self->indexes.edge_removal_order = edge_removal_order; self->indexes.num_edges = self->edges.num_rows; out: return ret; } bool tsk_table_collection_has_index( const tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { return self->indexes.edge_insertion_order != NULL && self->indexes.edge_removal_order != NULL && self->indexes.num_edges == self->edges.num_rows; } bool tsk_table_collection_has_reference_sequence(const tsk_table_collection_t *self) { return !tsk_reference_sequence_is_null(&self->reference_sequence); } int tsk_table_collection_drop_index( tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { tsk_safe_free(self->indexes.edge_insertion_order); tsk_safe_free(self->indexes.edge_removal_order); self->indexes.edge_insertion_order = NULL; self->indexes.edge_removal_order = NULL; self->indexes.num_edges = 0; return 0; } int TSK_WARN_UNUSED tsk_table_collection_build_index( tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = TSK_ERR_GENERIC; tsk_id_t ret_id; tsk_size_t j; double *time = self->nodes.time; index_sort_t *sort_buff = NULL; tsk_id_t parent; /* For build indexes to make sense we must have referential integrity and * sorted edges */ ret_id = tsk_table_collection_check_integrity(self, TSK_CHECK_EDGE_ORDERING); if (ret_id != 0) { ret = (int) ret_id; goto out; } tsk_table_collection_drop_index(self, 0); self->indexes.edge_insertion_order = tsk_malloc(self->edges.num_rows * sizeof(tsk_id_t)); self->indexes.edge_removal_order = tsk_malloc(self->edges.num_rows * sizeof(tsk_id_t)); sort_buff = tsk_malloc(self->edges.num_rows * sizeof(index_sort_t)); if (self->indexes.edge_insertion_order == NULL || self->indexes.edge_removal_order == NULL || sort_buff == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } /* sort by left and increasing time to give us the order in which * records should be inserted */ for (j = 0; j < self->edges.num_rows; j++) 
{ sort_buff[j].index = (tsk_id_t) j; sort_buff[j].first = self->edges.left[j]; parent = self->edges.parent[j]; sort_buff[j].second = time[parent]; sort_buff[j].third = parent; sort_buff[j].fourth = self->edges.child[j]; } qsort( sort_buff, (size_t) self->edges.num_rows, sizeof(index_sort_t), cmp_index_sort); for (j = 0; j < self->edges.num_rows; j++) { self->indexes.edge_insertion_order[j] = sort_buff[j].index; } /* sort by right and decreasing parent time to give us the order in which * records should be removed. */ for (j = 0; j < self->edges.num_rows; j++) { sort_buff[j].index = (tsk_id_t) j; sort_buff[j].first = self->edges.right[j]; parent = self->edges.parent[j]; sort_buff[j].second = -time[parent]; sort_buff[j].third = -parent; sort_buff[j].fourth = -self->edges.child[j]; } qsort( sort_buff, (size_t) self->edges.num_rows, sizeof(index_sort_t), cmp_index_sort); for (j = 0; j < self->edges.num_rows; j++) { self->indexes.edge_removal_order[j] = sort_buff[j].index; } self->indexes.num_edges = self->edges.num_rows; ret = 0; out: tsk_safe_free(sort_buff); return ret; } static int TSK_WARN_UNUSED tsk_table_collection_set_file_uuid(tsk_table_collection_t *self, const char *uuid) { int ret = 0; tsk_safe_free(self->file_uuid); self->file_uuid = NULL; if (uuid != NULL) { /* Allow space for \0 so we can print it as a string */ self->file_uuid = tsk_malloc(TSK_UUID_SIZE + 1); if (self->file_uuid == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy(self->file_uuid, uuid, TSK_UUID_SIZE); self->file_uuid[TSK_UUID_SIZE] = '\0'; } out: return ret; } int TSK_WARN_UNUSED tsk_table_collection_copy(const tsk_table_collection_t *self, tsk_table_collection_t *dest, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_table_collection_init(dest, options); if (ret != 0) { goto out; } } ret = tsk_node_table_copy(&self->nodes, &dest->nodes, TSK_NO_INIT); if (ret != 0) { goto out; } ret = tsk_edge_table_copy(&self->edges, &dest->edges, 
TSK_NO_INIT); if (ret != 0) { goto out; } ret = tsk_migration_table_copy(&self->migrations, &dest->migrations, TSK_NO_INIT); if (ret != 0) { goto out; } ret = tsk_site_table_copy(&self->sites, &dest->sites, TSK_NO_INIT); if (ret != 0) { goto out; } ret = tsk_mutation_table_copy(&self->mutations, &dest->mutations, TSK_NO_INIT); if (ret != 0) { goto out; } ret = tsk_individual_table_copy(&self->individuals, &dest->individuals, TSK_NO_INIT); if (ret != 0) { goto out; } ret = tsk_population_table_copy(&self->populations, &dest->populations, TSK_NO_INIT); if (ret != 0) { goto out; } ret = tsk_provenance_table_copy(&self->provenances, &dest->provenances, TSK_NO_INIT); if (ret != 0) { goto out; } dest->sequence_length = self->sequence_length; if (tsk_table_collection_has_index(self, 0)) { ret = tsk_table_collection_set_indexes( dest, self->indexes.edge_insertion_order, self->indexes.edge_removal_order); if (ret != 0) { goto out; } } ret = tsk_table_collection_set_time_units( dest, self->time_units, self->time_units_length); if (ret != 0) { goto out; } ret = tsk_table_collection_set_metadata(dest, self->metadata, self->metadata_length); if (ret != 0) { goto out; } ret = tsk_table_collection_set_metadata_schema( dest, self->metadata_schema, self->metadata_schema_length); if (ret != 0) { goto out; } ret = tsk_reference_sequence_copy( &self->reference_sequence, &dest->reference_sequence, options); if (ret != 0) { goto out; } if (options & TSK_COPY_FILE_UUID) { /* The UUID should only be generated on writing to a file (see the call * to generate_uuid in tsk_table_collection_write_format_data) and * no other writing access is supported. We only read the value from * the file, and raise an error if it's the wrong length there. Thus, * finding a UUID value of any other length here is undefined behaviour. 
*/ tsk_bug_assert( self->file_uuid == NULL || strlen(self->file_uuid) == TSK_UUID_SIZE); ret = tsk_table_collection_set_file_uuid(dest, self->file_uuid); if (ret != 0) { goto out; } } out: return ret; } static int TSK_WARN_UNUSED tsk_table_collection_read_format_data(tsk_table_collection_t *self, kastore_t *store) { int ret = 0; size_t len; uint32_t *version = NULL; int8_t *format_name = NULL; int8_t *uuid = NULL; double *L = NULL; char *time_units = NULL; char *metadata = NULL; char *metadata_schema = NULL; size_t time_units_length, metadata_length, metadata_schema_length; /* TODO we could simplify this function quite a bit if we use the * read_table_properties infrastructure. We would need to add the * ability to have non-optional columns to that though. */ ret = kastore_gets_int8(store, "format/name", &format_name, &len); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } if (len != TSK_FILE_FORMAT_NAME_LENGTH) { ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); goto out; } if (tsk_memcmp(TSK_FILE_FORMAT_NAME, format_name, TSK_FILE_FORMAT_NAME_LENGTH) != 0) { ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); goto out; } ret = kastore_gets_uint32(store, "format/version", &version, &len); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } if (len != 2) { ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); goto out; } if (version[0] < TSK_FILE_FORMAT_VERSION_MAJOR) { ret = tsk_trace_error(TSK_ERR_FILE_VERSION_TOO_OLD); goto out; } if (version[0] > TSK_FILE_FORMAT_VERSION_MAJOR) { ret = tsk_trace_error(TSK_ERR_FILE_VERSION_TOO_NEW); goto out; } ret = kastore_gets_float64(store, "sequence_length", &L, &len); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } if (len != 1) { ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); goto out; } if (L[0] <= 0.0) { ret = tsk_trace_error(TSK_ERR_BAD_SEQUENCE_LENGTH); goto out; } self->sequence_length = L[0]; ret = kastore_gets_int8(store, "uuid", &uuid, &len); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } if (len != 
TSK_UUID_SIZE) { ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); goto out; } ret = tsk_table_collection_set_file_uuid(self, (const char *) uuid); if (ret != 0) { goto out; } ret = kastore_containss(store, "time_units"); if (ret < 0) { ret = tsk_set_kas_error(ret); goto out; } if (ret == 1) { ret = kastore_gets_int8( store, "time_units", (int8_t **) &time_units, &time_units_length); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } ret = tsk_table_collection_set_time_units( self, time_units, (tsk_size_t) time_units_length); if (ret != 0) { goto out; } } ret = kastore_containss(store, "metadata"); if (ret < 0) { ret = tsk_set_kas_error(ret); goto out; } if (ret == 1) { ret = kastore_gets_int8( store, "metadata", (int8_t **) &metadata, &metadata_length); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } ret = tsk_table_collection_takeset_metadata( self, metadata, (tsk_size_t) metadata_length); if (ret != 0) { goto out; } metadata = NULL; } ret = kastore_containss(store, "metadata_schema"); if (ret < 0) { ret = tsk_set_kas_error(ret); goto out; } if (ret == 1) { ret = kastore_gets_int8(store, "metadata_schema", (int8_t **) &metadata_schema, (size_t *) &metadata_schema_length); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } ret = tsk_table_collection_set_metadata_schema( self, metadata_schema, (tsk_size_t) metadata_schema_length); if (ret != 0) { goto out; } } out: if ((ret ^ (1 << TSK_KAS_ERR_BIT)) == KAS_ERR_KEY_NOT_FOUND) { ret = tsk_trace_error(TSK_ERR_REQUIRED_COL_NOT_FOUND); } tsk_safe_free(version); tsk_safe_free(format_name); tsk_safe_free(uuid); tsk_safe_free(L); tsk_safe_free(time_units); tsk_safe_free(metadata_schema); tsk_safe_free(metadata); return ret; } static int TSK_WARN_UNUSED tsk_table_collection_dump_indexes(const tsk_table_collection_t *self, kastore_t *store, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; write_table_col_t cols[] = { { "indexes/edge_insertion_order", NULL, self->indexes.num_edges, TSK_ID_STORAGE_TYPE }, { 
"indexes/edge_removal_order", NULL, self->indexes.num_edges, TSK_ID_STORAGE_TYPE }, { .name = NULL }, }; if (tsk_table_collection_has_index(self, 0)) { cols[0].array = self->indexes.edge_insertion_order; cols[1].array = self->indexes.edge_removal_order; ret = write_table_cols(store, cols, 0); } return ret; } static int TSK_WARN_UNUSED tsk_table_collection_load_indexes(tsk_table_collection_t *self, kastore_t *store) { int ret = 0; tsk_id_t *edge_insertion_order = NULL; tsk_id_t *edge_removal_order = NULL; tsk_size_t num_rows; read_table_col_t cols[] = { { "indexes/edge_insertion_order", (void **) &edge_insertion_order, TSK_ID_STORAGE_TYPE, TSK_COL_OPTIONAL }, { "indexes/edge_removal_order", (void **) &edge_removal_order, TSK_ID_STORAGE_TYPE, TSK_COL_OPTIONAL }, { .name = NULL }, }; num_rows = TSK_NUM_ROWS_UNSET; ret = read_table_cols(store, &num_rows, cols, 0); if (ret != 0) { goto out; } if ((edge_insertion_order == NULL) != (edge_removal_order == NULL)) { ret = tsk_trace_error(TSK_ERR_BOTH_COLUMNS_REQUIRED); goto out; } if (edge_insertion_order != NULL) { if (num_rows != self->edges.num_rows) { ret = tsk_trace_error(TSK_ERR_FILE_FORMAT); goto out; } ret = tsk_table_collection_takeset_indexes( self, edge_insertion_order, edge_removal_order); if (ret != 0) { goto out; } } edge_insertion_order = NULL; edge_removal_order = NULL; out: tsk_safe_free(edge_insertion_order); tsk_safe_free(edge_removal_order); return ret; } static int tsk_table_collection_load_reference_sequence( tsk_table_collection_t *self, kastore_t *store) { int ret = 0; char *data = NULL; char *url = NULL; char *metadata = NULL; char *metadata_schema = NULL; tsk_size_t data_length = 0, url_length, metadata_length, metadata_schema_length; read_table_property_t properties[] = { { "reference_sequence/data", (void **) &data, &data_length, KAS_UINT8, TSK_COL_OPTIONAL }, { "reference_sequence/url", (void **) &url, &url_length, KAS_UINT8, TSK_COL_OPTIONAL }, { "reference_sequence/metadata", (void **) 
&metadata, &metadata_length, KAS_UINT8, TSK_COL_OPTIONAL }, { "reference_sequence/metadata_schema", (void **) &metadata_schema, &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, { .name = NULL }, }; ret = read_table_properties(store, properties, 0); if (ret != 0) { goto out; } if (data != NULL) { ret = tsk_reference_sequence_takeset_data( &self->reference_sequence, data, (tsk_size_t) data_length); if (ret != 0) { goto out; } data = NULL; } if (metadata != NULL) { ret = tsk_reference_sequence_takeset_metadata( &self->reference_sequence, metadata, (tsk_size_t) metadata_length); if (ret != 0) { goto out; } metadata = NULL; } if (metadata_schema != NULL) { ret = tsk_reference_sequence_set_metadata_schema(&self->reference_sequence, metadata_schema, (tsk_size_t) metadata_schema_length); if (ret != 0) { goto out; } } if (url != NULL) { ret = tsk_reference_sequence_set_url( &self->reference_sequence, url, (tsk_size_t) url_length); if (ret != 0) { goto out; } } out: free_read_table_mem(NULL, NULL, properties); return ret; } static int TSK_WARN_UNUSED tsk_table_collection_loadf_inited( tsk_table_collection_t *self, FILE *file, tsk_flags_t options) { int ret = 0; kastore_t store; int kas_flags = KAS_READ_ALL; if ((options & TSK_LOAD_SKIP_TABLES) || (options & TSK_LOAD_SKIP_REFERENCE_SEQUENCE)) { kas_flags = 0; } kas_flags = kas_flags | KAS_GET_TAKES_OWNERSHIP; ret = kastore_openf(&store, file, "r", kas_flags); if (ret != 0) { if (ret == KAS_ERR_EOF) { /* KAS_ERR_EOF means that we tried to read a store from the stream * and we hit EOF immediately without reading any bytes. 
We signal * this back to the client, which allows it to read an indefinite * number of stores from a stream */ ret = tsk_trace_error(TSK_ERR_EOF); } else { ret = tsk_set_kas_error(ret); } goto out; } ret = tsk_table_collection_read_format_data(self, &store); if (ret != 0) { goto out; } if (!(options & TSK_LOAD_SKIP_TABLES)) { ret = tsk_node_table_load(&self->nodes, &store); if (ret != 0) { goto out; } ret = tsk_edge_table_load(&self->edges, &store); if (ret != 0) { goto out; } ret = tsk_site_table_load(&self->sites, &store); if (ret != 0) { goto out; } ret = tsk_mutation_table_load(&self->mutations, &store); if (ret != 0) { goto out; } ret = tsk_migration_table_load(&self->migrations, &store); if (ret != 0) { goto out; } ret = tsk_individual_table_load(&self->individuals, &store); if (ret != 0) { goto out; } ret = tsk_population_table_load(&self->populations, &store); if (ret != 0) { goto out; } ret = tsk_provenance_table_load(&self->provenances, &store); if (ret != 0) { goto out; } ret = tsk_table_collection_load_indexes(self, &store); if (ret != 0) { goto out; } } else { ret = tsk_table_collection_build_index(self, 0); if (ret != 0) { goto out; } } if (!(options & TSK_LOAD_SKIP_REFERENCE_SEQUENCE)) { ret = tsk_table_collection_load_reference_sequence(self, &store); if (ret != 0) { goto out; } } ret = kastore_close(&store); if (ret != 0) { goto out; } out: /* If we're exiting on an error, we ignore any further errors that might come * from kastore. 
In the nominal case, closing an already-closed store is a * safe noop */ kastore_close(&store); return ret; } int TSK_WARN_UNUSED tsk_table_collection_loadf(tsk_table_collection_t *self, FILE *file, tsk_flags_t options) { int ret = 0; if (!(options & TSK_NO_INIT)) { ret = tsk_table_collection_init(self, options); if (ret != 0) { goto out; } } ret = tsk_table_collection_loadf_inited(self, file, options); if (ret != 0) { goto out; } out: return ret; } int TSK_WARN_UNUSED tsk_table_collection_load( tsk_table_collection_t *self, const char *filename, tsk_flags_t options) { int ret = 0; FILE *file = NULL; if (!(options & TSK_NO_INIT)) { ret = tsk_table_collection_init(self, options); if (ret != 0) { goto out; } } file = fopen(filename, "rb"); if (file == NULL) { ret = tsk_trace_error(TSK_ERR_IO); goto out; } ret = tsk_table_collection_loadf_inited(self, file, options); if (ret != 0) { goto out; } if (fclose(file) != 0) { ret = tsk_trace_error(TSK_ERR_IO); goto out; } file = NULL; out: if (file != NULL) { /* Ignore any additional errors we might get when closing the file * in error conditions */ fclose(file); } return ret; } static int TSK_WARN_UNUSED tsk_table_collection_dump_reference_sequence(const tsk_table_collection_t *self, kastore_t *store, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; const tsk_reference_sequence_t *ref = &self->reference_sequence; write_table_col_t write_cols[] = { { "reference_sequence/data", (void *) ref->data, ref->data_length, KAS_UINT8 }, { "reference_sequence/url", (void *) ref->url, ref->url_length, KAS_UINT8 }, { "reference_sequence/metadata", (void *) ref->metadata, ref->metadata_length, KAS_UINT8 }, { "reference_sequence/metadata_schema", (void *) ref->metadata_schema, ref->metadata_schema_length, KAS_UINT8 }, { .name = NULL }, }; if (tsk_table_collection_has_reference_sequence(self)) { ret = write_table_cols(store, write_cols, 0); } return ret; } int TSK_WARN_UNUSED tsk_table_collection_dump( const tsk_table_collection_t *self, 
const char *filename, tsk_flags_t options) { int ret = 0; FILE *file = fopen(filename, "wb"); if (file == NULL) { ret = tsk_trace_error(TSK_ERR_IO); goto out; } ret = tsk_table_collection_dumpf(self, file, options); if (ret != 0) { goto out; } if (fclose(file) != 0) { ret = tsk_trace_error(TSK_ERR_IO); goto out; } file = NULL; out: if (file != NULL) { /* Ignore any additional errors we might get when closing the file * in error conditions */ fclose(file); /* If an error occurred make sure that the filename is removed */ remove(filename); } return ret; } int TSK_WARN_UNUSED tsk_table_collection_dumpf( const tsk_table_collection_t *self, FILE *file, tsk_flags_t options) { int ret = 0; kastore_t store; char uuid[TSK_UUID_SIZE + 1]; // Must include space for trailing null. write_table_col_t format_columns[] = { { "format/name", (const void *) &TSK_FILE_FORMAT_NAME, TSK_FILE_FORMAT_NAME_LENGTH, KAS_INT8 }, { "format/version", (const void *) &(uint32_t[]) { TSK_FILE_FORMAT_VERSION_MAJOR, TSK_FILE_FORMAT_VERSION_MINOR }, 2, KAS_UINT32 }, { "sequence_length", (const void *) &self->sequence_length, 1, KAS_FLOAT64 }, { "uuid", (void *) uuid, TSK_UUID_SIZE, KAS_INT8 }, { "time_units", (void *) self->time_units, self->time_units_length, KAS_INT8 }, { "metadata", (void *) self->metadata, self->metadata_length, KAS_INT8 }, { "metadata_schema", (void *) self->metadata_schema, self->metadata_schema_length, KAS_INT8 }, { .name = NULL }, }; tsk_memset(&store, 0, sizeof(store)); ret = kastore_openf(&store, file, "w", 0); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } /* Write format data */ ret = tsk_generate_uuid(uuid, 0); if (ret != 0) { goto out; } ret = write_table_cols(&store, format_columns, options); if (ret != 0) { goto out; } /* All of these functions will set the kas_error internally, so we don't have * to modify the return value. 
*/ ret = tsk_node_table_dump(&self->nodes, &store, options); if (ret != 0) { goto out; } ret = tsk_edge_table_dump(&self->edges, &store, options); if (ret != 0) { goto out; } ret = tsk_site_table_dump(&self->sites, &store, options); if (ret != 0) { goto out; } ret = tsk_migration_table_dump(&self->migrations, &store, options); if (ret != 0) { goto out; } ret = tsk_mutation_table_dump(&self->mutations, &store, options); if (ret != 0) { goto out; } ret = tsk_individual_table_dump(&self->individuals, &store, options); if (ret != 0) { goto out; } ret = tsk_population_table_dump(&self->populations, &store, options); if (ret != 0) { goto out; } ret = tsk_provenance_table_dump(&self->provenances, &store, options); if (ret != 0) { goto out; } ret = tsk_table_collection_dump_indexes(self, &store, options); if (ret != 0) { goto out; } ret = tsk_table_collection_dump_reference_sequence(self, &store, options); if (ret != 0) { goto out; } ret = kastore_close(&store); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; } out: /* It's safe to close a kastore twice. */ if (ret != 0) { kastore_close(&store); } return ret; } int TSK_WARN_UNUSED tsk_table_collection_simplify(tsk_table_collection_t *self, const tsk_id_t *samples, tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map) { int ret = 0; simplifier_t simplifier; tsk_id_t *local_samples = NULL; tsk_id_t u; /* Avoid calling to simplifier_free with uninit'd memory on error branches */ tsk_memset(&simplifier, 0, sizeof(simplifier_t)); if ((options & TSK_SIMPLIFY_KEEP_UNARY) && (options & TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS)) { ret = tsk_trace_error(TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE); goto out; } /* For now we don't bother with edge metadata, but it can easily be * implemented. 
*/ if (self->edges.metadata_length > 0) { ret = tsk_trace_error(TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA); goto out; } if (samples == NULL) { local_samples = tsk_malloc(self->nodes.num_rows * sizeof(*local_samples)); if (local_samples == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } num_samples = 0; for (u = 0; u < (tsk_id_t) self->nodes.num_rows; u++) { if (!!(self->nodes.flags[u] & TSK_NODE_IS_SAMPLE)) { local_samples[num_samples] = u; num_samples++; } } samples = local_samples; } ret = simplifier_init(&simplifier, samples, num_samples, self, options); if (ret != 0) { goto out; } ret = simplifier_run(&simplifier, node_map); if (ret != 0) { goto out; } if (!!(options & TSK_DEBUG)) { simplifier_print_state(&simplifier, tsk_get_debug_stream()); } /* The indexes are invalidated now so drop them */ ret = tsk_table_collection_drop_index(self, 0); out: simplifier_free(&simplifier); tsk_safe_free(local_samples); return ret; } int TSK_WARN_UNUSED tsk_table_collection_link_ancestors(tsk_table_collection_t *self, tsk_id_t *samples, tsk_size_t num_samples, tsk_id_t *ancestors, tsk_size_t num_ancestors, tsk_flags_t TSK_UNUSED(options), tsk_edge_table_t *result) { int ret = 0; ancestor_mapper_t ancestor_mapper; tsk_memset(&ancestor_mapper, 0, sizeof(ancestor_mapper_t)); if (self->edges.metadata_length > 0) { ret = tsk_trace_error(TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA); goto out; } ret = ancestor_mapper_init( &ancestor_mapper, samples, num_samples, ancestors, num_ancestors, self, result); if (ret != 0) { goto out; } ret = ancestor_mapper_run(&ancestor_mapper); if (ret != 0) { goto out; } out: ancestor_mapper_free(&ancestor_mapper); return ret; } int TSK_WARN_UNUSED tsk_table_collection_ibd_within(const tsk_table_collection_t *self, tsk_identity_segments_t *result, const tsk_id_t *samples, tsk_size_t num_samples, double min_span, double max_time, tsk_flags_t options) { int ret = 0; tsk_ibd_finder_t ibd_finder; ret = tsk_identity_segments_init(result, 
self->nodes.num_rows, options); if (ret != 0) { goto out; } ret = tsk_ibd_finder_init(&ibd_finder, self, result, min_span, max_time); if (ret != 0) { goto out; } ret = tsk_ibd_finder_init_within(&ibd_finder, samples, num_samples); if (ret != 0) { goto out; } ret = tsk_ibd_finder_run(&ibd_finder); if (ret != 0) { goto out; } if (!!(options & TSK_DEBUG)) { tsk_ibd_finder_print_state(&ibd_finder, tsk_get_debug_stream()); } out: tsk_ibd_finder_free(&ibd_finder); return ret; } int TSK_WARN_UNUSED tsk_table_collection_ibd_between(const tsk_table_collection_t *self, tsk_identity_segments_t *result, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, double min_span, double max_time, tsk_flags_t options) { int ret = 0; tsk_ibd_finder_t ibd_finder; ret = tsk_identity_segments_init(result, self->nodes.num_rows, options); if (ret != 0) { goto out; } ret = tsk_ibd_finder_init(&ibd_finder, self, result, min_span, max_time); if (ret != 0) { goto out; } ret = tsk_ibd_finder_init_between( &ibd_finder, num_sample_sets, sample_set_sizes, sample_sets); if (ret != 0) { goto out; } ret = tsk_ibd_finder_run(&ibd_finder); if (ret != 0) { goto out; } if (!!(options & TSK_DEBUG)) { tsk_ibd_finder_print_state(&ibd_finder, tsk_get_debug_stream()); } out: tsk_ibd_finder_free(&ibd_finder); return ret; } int TSK_WARN_UNUSED tsk_table_collection_sort( tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options) { int ret = 0; tsk_table_sorter_t sorter; ret = tsk_table_sorter_init(&sorter, self, options); if (ret != 0) { goto out; } ret = tsk_table_sorter_run(&sorter, start); if (ret != 0) { goto out; } out: tsk_table_sorter_free(&sorter); return ret; } int TSK_WARN_UNUSED tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; tsk_id_t k; tsk_id_t *nodes = NULL; tsk_table_sorter_t sorter; tsk_flags_t subset_options = options & TSK_SUBSET_KEEP_UNREFERENCED; ret = tsk_table_sorter_init(&sorter, 
self, 0); if (ret != 0) { goto out; } sorter.sort_mutations = tsk_table_sorter_sort_mutations; sorter.sort_individuals = tsk_table_sorter_sort_individuals_canonical; nodes = tsk_malloc(self->nodes.num_rows * sizeof(*nodes)); if (nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (k = 0; k < (tsk_id_t) self->nodes.num_rows; k++) { nodes[k] = k; } ret = tsk_table_collection_subset(self, nodes, self->nodes.num_rows, subset_options); if (ret != 0) { goto out; } ret = tsk_table_sorter_run(&sorter, NULL); if (ret != 0) { goto out; } out: tsk_safe_free(nodes); tsk_table_sorter_free(&sorter); return ret; } /* * Remove any sites with duplicate positions, retaining only the *first* * one. Assumes the tables have been sorted, throwing an error if not. */ int TSK_WARN_UNUSED tsk_table_collection_deduplicate_sites( tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t ret_id; tsk_size_t j; /* Map of old site IDs to new site IDs. */ tsk_id_t *site_id_map = NULL; tsk_site_table_t copy; tsk_site_t row, last_row; /* Early exit if there's 0 rows. We don't exit early for one row because * we would then skip error checking, making the semantics inconsistent. 
*/ if (self->sites.num_rows == 0) { return 0; } /* Must allocate the site table first for tsk_site_table_free to be safe */ ret = tsk_site_table_copy(&self->sites, ©, 0); if (ret != 0) { goto out; } ret_id = tsk_table_collection_check_integrity(self, TSK_CHECK_SITE_ORDERING); if (ret_id != 0) { ret = (int) ret_id; goto out; } site_id_map = tsk_malloc(copy.num_rows * sizeof(*site_id_map)); if (site_id_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_site_table_clear(&self->sites); if (ret != 0) { goto out; } last_row.position = -1; site_id_map[0] = 0; for (j = 0; j < copy.num_rows; j++) { tsk_site_table_get_row_unsafe(©, (tsk_id_t) j, &row); if (row.position != last_row.position) { ret_id = tsk_site_table_add_row(&self->sites, row.position, row.ancestral_state, row.ancestral_state_length, row.metadata, row.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } site_id_map[j] = (tsk_id_t) self->sites.num_rows - 1; last_row = row; } if (self->sites.num_rows < copy.num_rows) { // Remap sites in the mutation table // (but only if there's been any changed sites) for (j = 0; j < self->mutations.num_rows; j++) { self->mutations.site[j] = site_id_map[self->mutations.site[j]]; } } ret = 0; out: tsk_site_table_free(©); tsk_safe_free(site_id_map); return ret; } int TSK_WARN_UNUSED tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; tsk_mutation_table_t *mutations = &self->mutations; tsk_id_t *parent_backup = NULL; bool restore_parents = false; if (!(options & TSK_NO_CHECK_INTEGRITY)) { if (mutations->num_rows > 0) { /* We need to wipe the parent column before computing, as otherwise invalid * parents can cause integrity checks to fail. 
We take a copy to restore on * error */ parent_backup = tsk_malloc(mutations->num_rows * sizeof(*parent_backup)); if (parent_backup == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memcpy(parent_backup, mutations->parent, mutations->num_rows * sizeof(*parent_backup)); /* Set the parent pointers to TSK_NULL */ tsk_memset(mutations->parent, 0xff, mutations->num_rows * sizeof(*mutations->parent)); restore_parents = true; } /* Safe to cast here as we're not counting trees */ ret = (int) tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); if (ret < 0) { goto out; } } ret = tsk_table_collection_compute_mutation_parents_to_array( self, self->mutations.parent); if (ret != 0) { goto out; } out: if (ret != 0 && restore_parents) { tsk_memcpy(mutations->parent, parent_backup, mutations->num_rows * sizeof(*parent_backup)); } tsk_safe_free(parent_backup); return ret; } int TSK_WARN_UNUSED tsk_table_collection_compute_mutation_times( tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t num_trees; const tsk_id_t *restrict I = self->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->indexes.edge_removal_order; const tsk_edge_table_t edges = self->edges; const tsk_node_table_t nodes = self->nodes; const tsk_site_table_t sites = self->sites; const tsk_mutation_table_t mutations = self->mutations; const tsk_id_t M = (tsk_id_t) edges.num_rows; tsk_id_t tj, tk; tsk_id_t *parent = NULL; double *numerator = NULL; double *denominator = NULL; tsk_id_t u; double left, right, parent_time; tsk_id_t site; /* Using unsigned values here avoids potentially undefined behaviour */ tsk_size_t j, mutation, first_mutation; tsk_bookmark_t skip_edges = { 0, 0, self->edges.num_rows, 0, 0, 0, 0, 0 }; /* The random param is for future usage */ if (random != NULL) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } /* First set the times to TSK_UNKNOWN_TIME so that check will succeed */ for (j = 0; j < 
mutations.num_rows; j++) { mutations.time[j] = TSK_UNKNOWN_TIME; } /* TSK_CHECK_MUTATION_PARENTS isn't needed here as we're not using the parents */ num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); if (num_trees < 0) { ret = (int) num_trees; goto out; } parent = tsk_malloc(nodes.num_rows * sizeof(*parent)); numerator = tsk_malloc(nodes.num_rows * sizeof(*numerator)); denominator = tsk_malloc(nodes.num_rows * sizeof(*denominator)); if (parent == NULL || numerator == NULL || denominator == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(parent, 0xff, nodes.num_rows * sizeof(*parent)); tsk_memset(numerator, 0, nodes.num_rows * sizeof(*numerator)); tsk_memset(denominator, 0, nodes.num_rows * sizeof(*denominator)); tj = 0; tk = 0; site = 0; mutation = 0; left = 0; while (tj < M || left < self->sequence_length) { while (tk < M && edges.right[O[tk]] == left) { parent[edges.child[O[tk]]] = TSK_NULL; tk++; } while (tj < M && edges.left[I[tj]] == left) { parent[edges.child[I[tj]]] = edges.parent[I[tj]]; tj++; } right = self->sequence_length; if (tj < M) { right = TSK_MIN(right, edges.left[I[tj]]); } if (tk < M) { right = TSK_MIN(right, edges.right[O[tk]]); } /* Tree is now ready. We look at each site on this tree in turn */ while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { first_mutation = mutation; /* Count how many mutations each edge has to get our denominator */ while (mutation < mutations.num_rows && mutations.site[mutation] == site) { denominator[mutations.node[mutation]]++; mutation++; } /* Go over the mutations again assigning times. 
As the sorting requirements guarantee that parents are before children, we assign oldest first */ for (j = first_mutation; j < mutation; j++) { u = mutations.node[j]; numerator[u]++; if (parent[u] == TSK_NULL) { /* This mutation is above a root */ mutations.time[j] = nodes.time[u]; } else { parent_time = nodes.time[parent[u]]; mutations.time[j] = parent_time - (parent_time - nodes.time[u]) * numerator[u] / (denominator[u] + 1); } } /* Reset the book-keeping for the next site */ for (j = first_mutation; j < mutation; j++) { u = mutations.node[j]; numerator[u] = 0; denominator[u] = 0; } site++; } /* Move on to the next tree */ left = right; } /* Now that mutations have times their sort order may have been invalidated, so * re-sort. Safe to cast the result to an int here because we're not counting * trees. */ ret = (int) tsk_table_collection_check_integrity(self, TSK_CHECK_MUTATION_ORDERING); if (ret == TSK_ERR_UNSORTED_MUTATIONS) { ret = tsk_table_collection_sort(self, &skip_edges, 0); if (ret != 0) { goto out; } } else if (ret < 0) { goto out; } out: tsk_safe_free(parent); tsk_safe_free(numerator); tsk_safe_free(denominator); return ret; } int TSK_WARN_UNUSED tsk_table_collection_delete_older( tsk_table_collection_t *self, double time, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_edge_t edge; tsk_mutation_t mutation; tsk_migration_t migration; tsk_edge_table_t edges; tsk_mutation_table_t mutations; tsk_migration_table_t migrations; const double *restrict node_time = self->nodes.time; tsk_id_t j, ret_id, parent; double mutation_time; tsk_id_t *mutation_map = NULL; memset(&edges, 0, sizeof(edges)); memset(&mutations, 0, sizeof(mutations)); memset(&migrations, 0, sizeof(migrations)); ret = tsk_edge_table_copy(&self->edges, &edges, 0); if (ret != 0) { goto out; } ret = tsk_edge_table_clear(&self->edges); if (ret != 0) { goto out; } for (j = 0; j < (tsk_id_t) edges.num_rows; j++) { tsk_edge_table_get_row_unsafe(&edges, j, &edge); if (node_time[edge.parent] <= 
time) { ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, edge.parent, edge.child, edge.metadata, edge.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } } /* Calling x_table_free multiple times is safe, so get rid of the * extra edge table memory as soon as we can. */ tsk_edge_table_free(&edges); mutation_map = tsk_malloc(self->mutations.num_rows * sizeof(*mutation_map)); if (mutation_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_mutation_table_copy(&self->mutations, &mutations, 0); if (ret != 0) { goto out; } ret = tsk_mutation_table_clear(&self->mutations); if (ret != 0) { goto out; } for (j = 0; j < (tsk_id_t) mutations.num_rows; j++) { tsk_mutation_table_get_row_unsafe(&mutations, j, &mutation); mutation_time = tsk_is_unknown_time(mutation.time) ? node_time[mutation.node] : mutation.time; mutation_map[j] = TSK_NULL; if (mutation_time < time) { ret_id = tsk_mutation_table_add_row(&self->mutations, mutation.site, mutation.node, mutation.parent, mutation.time, mutation.derived_state, mutation.derived_state_length, mutation.metadata, mutation.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } mutation_map[j] = ret_id; } } tsk_mutation_table_free(&mutations); for (j = 0; j < (tsk_id_t) self->mutations.num_rows; j++) { parent = self->mutations.parent[j]; if (parent != TSK_NULL) { self->mutations.parent[j] = mutation_map[parent]; } } ret = tsk_migration_table_copy(&self->migrations, &migrations, 0); if (ret != 0) { goto out; } ret = tsk_migration_table_clear(&self->migrations); if (ret != 0) { goto out; } for (j = 0; j < (tsk_id_t) migrations.num_rows; j++) { tsk_migration_table_get_row_unsafe(&migrations, j, &migration); if (migration.time < time) { ret_id = tsk_migration_table_add_row(&self->migrations, migration.left, migration.right, migration.node, migration.source, migration.dest, migration.time, migration.metadata, migration.metadata_length); if (ret_id < 0) { ret = (int) 
ret_id; goto out; } } } tsk_migration_table_free(&migrations); out: tsk_edge_table_free(&edges); tsk_mutation_table_free(&mutations); tsk_migration_table_free(&migrations); tsk_safe_free(mutation_map); return ret; } int tsk_table_collection_record_num_rows( const tsk_table_collection_t *self, tsk_bookmark_t *position) { position->individuals = self->individuals.num_rows; position->nodes = self->nodes.num_rows; position->edges = self->edges.num_rows; position->migrations = self->migrations.num_rows; position->sites = self->sites.num_rows; position->mutations = self->mutations.num_rows; position->populations = self->populations.num_rows; position->provenances = self->provenances.num_rows; return 0; } int TSK_WARN_UNUSED tsk_table_collection_truncate(tsk_table_collection_t *tables, tsk_bookmark_t *position) { int ret = 0; ret = tsk_table_collection_drop_index(tables, 0); if (ret != 0) { goto out; } ret = tsk_individual_table_truncate(&tables->individuals, position->individuals); if (ret != 0) { goto out; } ret = tsk_node_table_truncate(&tables->nodes, position->nodes); if (ret != 0) { goto out; } ret = tsk_edge_table_truncate(&tables->edges, position->edges); if (ret != 0) { goto out; } ret = tsk_migration_table_truncate(&tables->migrations, position->migrations); if (ret != 0) { goto out; } ret = tsk_site_table_truncate(&tables->sites, position->sites); if (ret != 0) { goto out; } ret = tsk_mutation_table_truncate(&tables->mutations, position->mutations); if (ret != 0) { goto out; } ret = tsk_population_table_truncate(&tables->populations, position->populations); if (ret != 0) { goto out; } ret = tsk_provenance_table_truncate(&tables->provenances, position->provenances); if (ret != 0) { goto out; } out: return ret; } int TSK_WARN_UNUSED tsk_table_collection_clear(tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; bool clear_provenance = !!(options & TSK_CLEAR_PROVENANCE); bool clear_metadata_schemas = !!(options & TSK_CLEAR_METADATA_SCHEMAS); bool 
clear_ts_metadata = !!(options & TSK_CLEAR_TS_METADATA_AND_SCHEMA); tsk_bookmark_t rows_to_retain = { .provenances = clear_provenance ? 0 : self->provenances.num_rows }; ret = tsk_table_collection_truncate(self, &rows_to_retain); if (ret != 0) { goto out; } if (clear_metadata_schemas) { ret = tsk_individual_table_set_metadata_schema(&self->individuals, "", 0); if (ret != 0) { goto out; } ret = tsk_node_table_set_metadata_schema(&self->nodes, "", 0); if (ret != 0) { goto out; } ret = tsk_edge_table_set_metadata_schema(&self->edges, "", 0); if (ret != 0) { goto out; } ret = tsk_migration_table_set_metadata_schema(&self->migrations, "", 0); if (ret != 0) { goto out; } ret = tsk_site_table_set_metadata_schema(&self->sites, "", 0); if (ret != 0) { goto out; } ret = tsk_mutation_table_set_metadata_schema(&self->mutations, "", 0); if (ret != 0) { goto out; } ret = tsk_population_table_set_metadata_schema(&self->populations, "", 0); if (ret != 0) { goto out; } } if (clear_ts_metadata) { ret = tsk_table_collection_set_metadata(self, "", 0); if (ret != 0) { goto out; } ret = tsk_table_collection_set_metadata_schema(self, "", 0); if (ret != 0) { goto out; } } out: return ret; } static int tsk_table_collection_add_and_remap_node(tsk_table_collection_t *self, const tsk_table_collection_t *other, tsk_id_t node_id, tsk_id_t *individual_map, tsk_id_t *population_map, tsk_id_t *node_map, bool add_populations) { int ret = 0; tsk_id_t ret_id, new_ind, new_pop; tsk_node_t node; tsk_individual_t ind; tsk_population_t pop; ret = tsk_node_table_get_row(&other->nodes, node_id, &node); if (ret < 0) { goto out; } new_ind = TSK_NULL; if (node.individual != TSK_NULL) { if (individual_map[node.individual] == TSK_NULL) { ret = tsk_individual_table_get_row( &other->individuals, node.individual, &ind); if (ret < 0) { goto out; } ret_id = tsk_individual_table_add_row(&self->individuals, ind.flags, ind.location, ind.location_length, ind.parents, ind.parents_length, ind.metadata, 
ind.metadata_length); if (ret < 0) { ret = (int) ret_id; goto out; } individual_map[node.individual] = ret_id; } new_ind = individual_map[node.individual]; } new_pop = TSK_NULL; if (node.population != TSK_NULL) { // keep same pops if add_populations is False if (!add_populations) { population_map[node.population] = node.population; } if (population_map[node.population] == TSK_NULL) { ret = tsk_population_table_get_row( &other->populations, node.population, &pop); if (ret < 0) { goto out; } ret_id = tsk_population_table_add_row( &self->populations, pop.metadata, pop.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } population_map[node.population] = ret_id; } new_pop = population_map[node.population]; } ret_id = tsk_node_table_add_row(&self->nodes, node.flags, node.time, new_pop, new_ind, node.metadata, node.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } node_map[node.id] = ret_id; out: return ret; } int TSK_WARN_UNUSED tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, tsk_size_t num_nodes, tsk_flags_t options) { int ret = 0; tsk_id_t ret_id, j, k, parent_ind, new_parent, new_child, new_node, site_id; tsk_size_t num_parents; tsk_individual_t ind; tsk_edge_t edge; tsk_id_t *node_map = NULL; tsk_id_t *individual_map = NULL; tsk_id_t *population_map = NULL; tsk_id_t *site_map = NULL; tsk_id_t *mutation_map = NULL; tsk_table_collection_t tables; tsk_population_t pop; tsk_site_t site; tsk_mutation_t mut; bool keep_unreferenced = !!(options & TSK_SUBSET_KEEP_UNREFERENCED); bool no_change_populations = !!(options & TSK_SUBSET_NO_CHANGE_POPULATIONS); ret = tsk_table_collection_copy(self, &tables, 0); if (ret != 0) { goto out; } /* Not calling TSK_CHECK_TREES so casting to int is safe */ ret = (int) tsk_table_collection_check_integrity(self, 0); if (ret != 0) { goto out; } ret = tsk_table_collection_clear(self, 0); if (ret != 0) { goto out; } node_map = tsk_malloc(tables.nodes.num_rows * sizeof(*node_map)); 
individual_map = tsk_malloc(tables.individuals.num_rows * sizeof(*individual_map)); population_map = tsk_malloc(tables.populations.num_rows * sizeof(*population_map)); site_map = tsk_malloc(tables.sites.num_rows * sizeof(*site_map)); mutation_map = tsk_malloc(tables.mutations.num_rows * sizeof(*mutation_map)); if (node_map == NULL || individual_map == NULL || population_map == NULL || site_map == NULL || mutation_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(node_map, 0xff, tables.nodes.num_rows * sizeof(*node_map)); tsk_memset( individual_map, 0xff, tables.individuals.num_rows * sizeof(*individual_map)); tsk_memset( population_map, 0xff, tables.populations.num_rows * sizeof(*population_map)); tsk_memset(site_map, 0xff, tables.sites.num_rows * sizeof(*site_map)); tsk_memset(mutation_map, 0xff, tables.mutations.num_rows * sizeof(*mutation_map)); if (no_change_populations) { ret = tsk_population_table_copy( &tables.populations, &self->populations, TSK_NO_INIT); if (ret < 0) { goto out; } for (k = 0; k < (tsk_id_t) tables.populations.num_rows; k++) { population_map[k] = k; } } // First do individuals so they stay in the same order. // So we can remap individual parents and not rely on sortedness, // we first check who to keep; then build the individual map, and // finally populate the tables. 
if (keep_unreferenced) { for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { // put a non-NULL value here; fill in the actual order next individual_map[k] = 0; } } else { for (k = 0; k < (tsk_id_t) num_nodes; k++) { if (nodes[k] < 0 || nodes[k] >= (tsk_id_t) tables.nodes.num_rows) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } j = tables.nodes.individual[nodes[k]]; if (j != TSK_NULL) { individual_map[j] = 0; } } } j = 0; for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { if (individual_map[k] != TSK_NULL) { individual_map[k] = j; j++; } } for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { if (individual_map[k] != TSK_NULL) { tsk_individual_table_get_row_unsafe(&tables.individuals, k, &ind); num_parents = 0; for (j = 0; j < (tsk_id_t) ind.parents_length; j++) { parent_ind = ind.parents[j]; new_parent = parent_ind; if (parent_ind != TSK_NULL) { new_parent = individual_map[parent_ind]; } if ((parent_ind == TSK_NULL) || (new_parent != TSK_NULL)) { /* Beware: this modifies the parents column of tables.individuals * in-place! But it's OK as we don't use it again. */ ind.parents[num_parents] = new_parent; num_parents++; } } ret_id = tsk_individual_table_add_row(&self->individuals, ind.flags, ind.location, ind.location_length, ind.parents, num_parents, ind.metadata, ind.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } tsk_bug_assert(individual_map[k] == ret_id); } } // Nodes and populations for (k = 0; k < (tsk_id_t) num_nodes; k++) { ret = tsk_table_collection_add_and_remap_node( self, &tables, nodes[k], individual_map, population_map, node_map, true); if (ret < 0) { goto out; } } /* TODO: Subset the migrations table. We would need to make sure * that we don't remove populations that are referenced, so it would * need to be done before the next code block. 
*/ if (tables.migrations.num_rows != 0) { ret = tsk_trace_error(TSK_ERR_MIGRATIONS_NOT_SUPPORTED); goto out; } if (keep_unreferenced) { // Keep unused populations for (k = 0; k < (tsk_id_t) tables.populations.num_rows; k++) { if (population_map[k] == TSK_NULL) { tsk_population_table_get_row_unsafe(&tables.populations, k, &pop); ret_id = tsk_population_table_add_row( &self->populations, pop.metadata, pop.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } } } // Edges for (k = 0; k < (tsk_id_t) tables.edges.num_rows; k++) { tsk_edge_table_get_row_unsafe(&tables.edges, k, &edge); new_parent = node_map[edge.parent]; new_child = node_map[edge.child]; if ((new_parent != TSK_NULL) && (new_child != TSK_NULL)) { ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, new_parent, new_child, edge.metadata, edge.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } } // Mutations and sites // Make a first pass through to build the mutation_map so that // mutation parent can be remapped even if the table is not in order. 
j = 0; for (k = 0; k < (tsk_id_t) tables.mutations.num_rows; k++) { if (node_map[tables.mutations.node[k]] != TSK_NULL) { mutation_map[k] = j; j++; site_id = tables.mutations.site[k]; if (site_map[site_id] == TSK_NULL) { // Insert a temporary non-NULL value site_map[site_id] = 1; } } } // Keep retained sites in their original order j = 0; for (k = 0; k < (tsk_id_t) tables.sites.num_rows; k++) { if (keep_unreferenced || site_map[k] != TSK_NULL) { tsk_site_table_get_row_unsafe(&tables.sites, k, &site); ret_id = tsk_site_table_add_row(&self->sites, site.position, site.ancestral_state, site.ancestral_state_length, site.metadata, site.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } site_map[k] = j; j++; } } for (k = 0; k < (tsk_id_t) tables.mutations.num_rows; k++) { tsk_mutation_table_get_row_unsafe(&tables.mutations, k, &mut); new_node = node_map[mut.node]; if (new_node != TSK_NULL) { new_parent = TSK_NULL; if (mut.parent != TSK_NULL) { new_parent = mutation_map[mut.parent]; } ret_id = tsk_mutation_table_add_row(&self->mutations, site_map[mut.site], new_node, new_parent, mut.time, mut.derived_state, mut.derived_state_length, mut.metadata, mut.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } tsk_bug_assert(mutation_map[mut.id] == ret_id); } if (ret < 0) { goto out; } } ret = 0; out: tsk_safe_free(node_map); tsk_safe_free(individual_map); tsk_safe_free(population_map); tsk_safe_free(site_map); tsk_safe_free(mutation_map); tsk_table_collection_free(&tables); return ret; } static int tsk_check_subset_equality(tsk_table_collection_t *self, const tsk_table_collection_t *other, const tsk_id_t *other_node_mapping, tsk_size_t num_shared_nodes) { int ret = 0; tsk_id_t k, i; tsk_id_t *self_nodes = NULL; tsk_id_t *other_nodes = NULL; tsk_table_collection_t self_copy; tsk_table_collection_t other_copy; tsk_memset(&self_copy, 0, sizeof(self_copy)); tsk_memset(&other_copy, 0, sizeof(other_copy)); self_nodes = tsk_malloc(num_shared_nodes * 
sizeof(*self_nodes)); other_nodes = tsk_malloc(num_shared_nodes * sizeof(*other_nodes)); if (self_nodes == NULL || other_nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } i = 0; for (k = 0; k < (tsk_id_t) other->nodes.num_rows; k++) { if (other_node_mapping[k] != TSK_NULL) { self_nodes[i] = other_node_mapping[k]; other_nodes[i] = k; i++; } } ret = tsk_table_collection_copy(self, &self_copy, 0); if (ret != 0) { goto out; } ret = tsk_table_collection_copy(other, &other_copy, 0); if (ret != 0) { goto out; } ret = tsk_table_collection_subset(&self_copy, self_nodes, num_shared_nodes, 0); if (ret != 0) { goto out; } ret = tsk_table_collection_subset(&other_copy, other_nodes, num_shared_nodes, 0); if (ret != 0) { goto out; } ret = tsk_table_collection_canonicalise(&self_copy, 0); if (ret != 0) { goto out; } ret = tsk_table_collection_canonicalise(&other_copy, 0); if (ret != 0) { goto out; } if (!tsk_table_collection_equals(&self_copy, &other_copy, TSK_CMP_IGNORE_TS_METADATA | TSK_CMP_IGNORE_PROVENANCE | TSK_CMP_IGNORE_REFERENCE_SEQUENCE)) { ret = tsk_trace_error(TSK_ERR_UNION_DIFF_HISTORIES); goto out; } out: tsk_table_collection_free(&self_copy); tsk_table_collection_free(&other_copy); tsk_safe_free(other_nodes); tsk_safe_free(self_nodes); return ret; } int TSK_WARN_UNUSED tsk_table_collection_union(tsk_table_collection_t *self, const tsk_table_collection_t *other, const tsk_id_t *other_node_mapping, tsk_flags_t options) { int ret = 0; tsk_id_t ret_id, k, i, new_parent, new_child; tsk_size_t num_shared_nodes = 0; tsk_size_t num_individuals_self = self->individuals.num_rows; tsk_edge_t edge; tsk_mutation_t mut; tsk_site_t site; tsk_id_t *node_map = NULL; tsk_id_t *individual_map = NULL; tsk_id_t *population_map = NULL; tsk_id_t *site_map = NULL; bool add_populations = !(options & TSK_UNION_NO_ADD_POP); bool check_shared_portion = !(options & TSK_UNION_NO_CHECK_SHARED); bool all_edges = !!(options & TSK_UNION_ALL_EDGES); bool all_mutations = !!(options 
& TSK_UNION_ALL_MUTATIONS); /* Not calling TSK_CHECK_TREES so casting to int is safe */ ret = (int) tsk_table_collection_check_integrity(self, 0); if (ret != 0) { goto out; } ret = (int) tsk_table_collection_check_integrity(other, 0); if (ret != 0) { goto out; } for (k = 0; k < (tsk_id_t) other->nodes.num_rows; k++) { if (other_node_mapping[k] >= (tsk_id_t) self->nodes.num_rows || other_node_mapping[k] < TSK_NULL) { ret = tsk_trace_error(TSK_ERR_UNION_BAD_MAP); goto out; } if (other_node_mapping[k] != TSK_NULL) { num_shared_nodes++; } } if (check_shared_portion) { ret = tsk_check_subset_equality( self, other, other_node_mapping, num_shared_nodes); if (ret != 0) { goto out; } } // Maps relating the IDs in other to the new IDs in self. node_map = tsk_malloc(other->nodes.num_rows * sizeof(*node_map)); individual_map = tsk_malloc(other->individuals.num_rows * sizeof(*individual_map)); population_map = tsk_malloc(other->populations.num_rows * sizeof(*population_map)); site_map = tsk_malloc(other->sites.num_rows * sizeof(*site_map)); if (node_map == NULL || individual_map == NULL || population_map == NULL || site_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(node_map, 0xff, other->nodes.num_rows * sizeof(*node_map)); tsk_memset( individual_map, 0xff, other->individuals.num_rows * sizeof(*individual_map)); tsk_memset( population_map, 0xff, other->populations.num_rows * sizeof(*population_map)); tsk_memset(site_map, 0xff, other->sites.num_rows * sizeof(*site_map)); /* We have to map the individuals who are linked to nodes in the intersection first as otherwise an individual linked to one node in the intersection and one in `other` would be duplicated. We assume that the individual in `self` takes priority. 
*/ for (k = 0; k < (tsk_id_t) other->nodes.num_rows; k++) { if (other_node_mapping[k] != TSK_NULL && other->nodes.individual[k] != TSK_NULL) { individual_map[other->nodes.individual[k]] = self->nodes.individual[other_node_mapping[k]]; } } // nodes, individuals, populations for (k = 0; k < (tsk_id_t) other->nodes.num_rows; k++) { if (other_node_mapping[k] != TSK_NULL) { node_map[k] = other_node_mapping[k]; } else { ret = tsk_table_collection_add_and_remap_node(self, other, k, individual_map, population_map, node_map, add_populations); if (ret < 0) { goto out; } } } /* Now we know the full individual map we can remap the parents of the new * individuals*/ for (k = (tsk_id_t) self->individuals.parents_offset[num_individuals_self]; k < (tsk_id_t) self->individuals.parents_length; k++) { if (self->individuals.parents[k] != TSK_NULL) { self->individuals.parents[k] = individual_map[self->individuals.parents[k]]; } } // edges for (k = 0; k < (tsk_id_t) other->edges.num_rows; k++) { tsk_edge_table_get_row_unsafe(&other->edges, k, &edge); if (all_edges || (other_node_mapping[edge.parent] == TSK_NULL) || (other_node_mapping[edge.child] == TSK_NULL)) { new_parent = node_map[edge.parent]; new_child = node_map[edge.child]; ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, new_parent, new_child, edge.metadata, edge.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } } // sites // first do the "disjoint" (all_mutations) case, where we just add all sites; // otherwise we want to just add sites for new mutations if (all_mutations) { for (k = 0; k < (tsk_id_t) other->sites.num_rows; k++) { tsk_site_table_get_row_unsafe(&other->sites, k, &site); ret_id = tsk_site_table_add_row(&self->sites, site.position, site.ancestral_state, site.ancestral_state_length, site.metadata, site.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } site_map[site.id] = ret_id; } } // mutations (and maybe sites) i = 0; for (k = 0; k < (tsk_id_t) 
other->sites.num_rows; k++) { tsk_site_table_get_row_unsafe(&other->sites, k, &site); while ((i < (tsk_id_t) other->mutations.num_rows) && (other->mutations.site[i] == site.id)) { tsk_mutation_table_get_row_unsafe(&other->mutations, i, &mut); if (all_mutations || (other_node_mapping[mut.node] == TSK_NULL)) { if (site_map[site.id] == TSK_NULL) { ret_id = tsk_site_table_add_row(&self->sites, site.position, site.ancestral_state, site.ancestral_state_length, site.metadata, site.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } site_map[site.id] = ret_id; } // the parents will be recomputed later new_parent = TSK_NULL; ret_id = tsk_mutation_table_add_row(&self->mutations, site_map[site.id], node_map[mut.node], new_parent, mut.time, mut.derived_state, mut.derived_state_length, mut.metadata, mut.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } i++; } } /* TODO: Union of the Migrations Table. The only hindrance to performing the * union operation on Migrations Tables is that tsk_table_collection_sort * does not sort migrations by time, and instead throws an error. 
*/ if (self->migrations.num_rows != 0 || other->migrations.num_rows != 0) { ret = tsk_trace_error(TSK_ERR_MIGRATIONS_NOT_SUPPORTED); goto out; } // sorting, deduplicating, and computing parents ret = tsk_table_collection_sort(self, 0, 0); if (ret < 0) { goto out; } ret = tsk_table_collection_deduplicate_sites(self, 0); if (ret < 0) { goto out; } // need to sort again since after deduplicating sites, mutations // may not be sorted by time within sites ret = tsk_table_collection_sort(self, 0, 0); if (ret < 0) { goto out; } ret = tsk_table_collection_build_index(self, 0); if (ret < 0) { goto out; } ret = tsk_table_collection_compute_mutation_parents(self, 0); if (ret < 0) { goto out; } out: tsk_safe_free(node_map); tsk_safe_free(individual_map); tsk_safe_free(population_map); tsk_safe_free(site_map); return ret; } static int cmp_edge_cl(const void *a, const void *b) { const tsk_edge_t *ia = (const tsk_edge_t *) a; const tsk_edge_t *ib = (const tsk_edge_t *) b; int ret = (ia->parent > ib->parent) - (ia->parent < ib->parent); if (ret == 0) { ret = (ia->child > ib->child) - (ia->child < ib->child); if (ret == 0) { ret = (ia->left > ib->left) - (ia->left < ib->left); } } return ret; } /* Squash the edges in the specified array in place. The output edges will * be sorted by (child_id, left). */ int TSK_WARN_UNUSED tsk_squash_edges(tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output_edges) { int ret = 0; tsk_size_t j, k, l; if (num_edges < 2) { *num_output_edges = num_edges; return ret; } qsort(edges, (size_t) num_edges, sizeof(tsk_edge_t), cmp_edge_cl); j = 0; l = 0; for (k = 1; k < num_edges; k++) { if (edges[k - 1].metadata_length > 0) { ret = tsk_trace_error(TSK_ERR_CANT_PROCESS_EDGES_WITH_METADATA); goto out; } /* Check for overlapping edges. 
*/ if (edges[k - 1].parent == edges[k].parent && edges[k - 1].child == edges[k].child && edges[k - 1].right > edges[k].left) { ret = tsk_trace_error(TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN); goto out; } /* Add squashed edge. */ if (edges[k - 1].parent != edges[k].parent || edges[k - 1].right != edges[k].left || edges[j].child != edges[k].child) { edges[l].left = edges[j].left; edges[l].right = edges[k - 1].right; edges[l].parent = edges[j].parent; edges[l].child = edges[j].child; j = k; l++; } } edges[l].left = edges[j].left; edges[l].right = edges[k - 1].right; edges[l].parent = edges[j].parent; edges[l].child = edges[j].child; *num_output_edges = (tsk_size_t) l + 1; out: return ret; } ================================================ FILE: c/tskit/tables.h ================================================ /* * MIT License * * Copyright (c) 2019-2024 Tskit Developers * Copyright (c) 2017-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

/**
 * @file tables.h
 * @brief Tskit Tables API.
 */
#ifndef TSK_TABLES_H
#define TSK_TABLES_H

#ifdef __cplusplus
extern "C" {
#endif

/* NOTE(review): the header names after each #include below are missing in this
 * copy of the file; restore them from the upstream source before compiling. */
#include
#include
#include
#include
#include

/****************************************************************************/
/* Definitions for the basic objects */
/****************************************************************************/

/**
@brief A single individual defined by a row in the individual table.

@rst
See the :ref:`data model ` section for the definition of
an individual and its properties.
@endrst
*/
typedef struct {
    /** @brief Non-negative ID value corresponding to table row. */
    tsk_id_t id;
    /** @brief Bitwise flags. */
    tsk_flags_t flags;
    /** @brief Spatial location. The number of dimensions is defined by
     * ``location_length``. */
    const double *location;
    /** @brief Number of spatial dimensions. */
    tsk_size_t location_length;
    /** @brief IDs of the parents. The number of parents is given by
     * ``parents_length``. */
    tsk_id_t *parents;
    /** @brief Number of parents. */
    tsk_size_t parents_length;
    /** @brief Metadata. */
    const char *metadata;
    /** @brief Size of the metadata in bytes. */
    tsk_size_t metadata_length;
    /** @brief An array of the nodes associated with this individual. */
    const tsk_id_t *nodes;
    /** @brief The number of nodes associated with this individual. */
    tsk_size_t nodes_length;
} tsk_individual_t;

/**
@brief A single node defined by a row in the node table.

@rst
See the :ref:`data model ` section for the definition of
a node and its properties.
@endrst
*/
typedef struct {
    /** @brief Non-negative ID value corresponding to table row. */
    tsk_id_t id;
    /** @brief Bitwise flags. */
    tsk_flags_t flags;
    /** @brief Time. */
    double time;
    /** @brief Population ID. */
    tsk_id_t population;
    /** @brief Individual ID. */
    tsk_id_t individual;
    /** @brief Metadata. */
    const char *metadata;
    /** @brief Size of the metadata in bytes. */
    tsk_size_t metadata_length;
} tsk_node_t;

/**
@brief A single edge defined by a row in the edge table.

@rst
See the :ref:`data model ` section for the definition of
an edge and its properties.
@endrst
*/
typedef struct {
    /** @brief Non-negative ID value corresponding to table row. */
    tsk_id_t id;
    /** @brief Parent node ID. */
    tsk_id_t parent;
    /** @brief Child node ID. */
    tsk_id_t child;
    /** @brief Left coordinate. */
    double left;
    /** @brief Right coordinate. */
    double right;
    /** @brief Metadata. */
    const char *metadata;
    /** @brief Size of the metadata in bytes. */
    tsk_size_t metadata_length;
} tsk_edge_t;

/**
@brief A single mutation defined by a row in the mutation table.

@rst
See the :ref:`data model ` section for the definition of
a mutation and its properties.
@endrst
*/
typedef struct {
    /** @brief Non-negative ID value corresponding to table row. */
    tsk_id_t id;
    /** @brief Site ID. */
    tsk_id_t site;
    /** @brief Node ID. */
    tsk_id_t node;
    /** @brief Parent mutation ID. */
    tsk_id_t parent;
    /** @brief Mutation time. */
    double time;
    /** @brief Derived state. */
    const char *derived_state;
    /** @brief Size of the derived state in bytes. */
    tsk_size_t derived_state_length;
    /** @brief Metadata. */
    const char *metadata;
    /** @brief Size of the metadata in bytes. */
    tsk_size_t metadata_length;
    /** @brief The ID of the edge that this mutation lies on, or TSK_NULL
      if there is no corresponding edge. */
    tsk_id_t edge;
    /** @brief Inherited state. */
    const char *inherited_state;
    /** @brief Size of the inherited state in bytes. */
    tsk_size_t inherited_state_length;
} tsk_mutation_t;

/**
@brief A single site defined by a row in the site table.

@rst
See the :ref:`data model ` section for the definition of
a site and its properties.
@endrst
*/
typedef struct {
    /** @brief Non-negative ID value corresponding to table row. */
    tsk_id_t id;
    /** @brief Position coordinate. */
    double position;
    /** @brief Ancestral state. */
    const char *ancestral_state;
    /** @brief Ancestral state length in bytes. */
    tsk_size_t ancestral_state_length;
    /** @brief Metadata. */
    const char *metadata;
    /** @brief Metadata length in bytes. */
    tsk_size_t metadata_length;
    /** @brief An array of this site's mutations. */
    const tsk_mutation_t *mutations;
    /** @brief The number of mutations at this site. */
    tsk_size_t mutations_length;
} tsk_site_t;

/**
@brief A single migration defined by a row in the migration table.

@rst
See the :ref:`data model ` section for the definition of
a migration and its properties.
@endrst
*/
typedef struct {
    /** @brief Non-negative ID value corresponding to table row. */
    tsk_id_t id;
    /** @brief Source population ID. */
    tsk_id_t source;
    /** @brief Destination population ID. */
    tsk_id_t dest;
    /** @brief Node ID. */
    tsk_id_t node;
    /** @brief Left coordinate. */
    double left;
    /** @brief Right coordinate. */
    double right;
    /** @brief Time. */
    double time;
    /** @brief Metadata. */
    const char *metadata;
    /** @brief Size of the metadata in bytes. */
    tsk_size_t metadata_length;
} tsk_migration_t;

/**
@brief A single population defined by a row in the population table.

@rst
See the :ref:`data model ` section for the definition of
a population and its properties.
@endrst
*/
typedef struct {
    /** @brief Non-negative ID value corresponding to table row. */
    tsk_id_t id;
    /** @brief Metadata. */
    const char *metadata;
    /** @brief Metadata length in bytes. */
    tsk_size_t metadata_length;
} tsk_population_t;

/**
@brief A single provenance defined by a row in the provenance table.

@rst
See the :ref:`data model ` section for the definition of
a provenance object and its properties. See the :ref:`sec_provenance` section
for more information on how provenance records should be structured.
@endrst
*/
typedef struct {
    /** @brief Non-negative ID value corresponding to table row. */
    tsk_id_t id;
    /** @brief The timestamp. */
    const char *timestamp;
    /** @brief The timestamp length in bytes. */
    tsk_size_t timestamp_length;
    /** @brief The record. */
    const char *record;
    /** @brief The record length in bytes. */
    tsk_size_t record_length;
} tsk_provenance_t;

/****************************************************************************/
/* Table definitions */
/****************************************************************************/

/**
@brief The individual table.

@rst
See the individual :ref:`table definition ` for
details of the columns in this table.
@endrst
*/
typedef struct {
    /** @brief The number of rows in this table. */
    tsk_size_t num_rows;
    /* NOTE(review): the max_* and max_*_increment members in this struct are
     * undocumented here; presumably allocated capacity and growth step for
     * the rows and for each variable-length column. Confirm in tables.c. */
    tsk_size_t max_rows;
    tsk_size_t max_rows_increment;
    /** @brief The total length of the location column. */
    tsk_size_t location_length;
    tsk_size_t max_location_length;
    tsk_size_t max_location_length_increment;
    /** @brief The total length of the parents column. */
    tsk_size_t parents_length;
    tsk_size_t max_parents_length;
    tsk_size_t max_parents_length_increment;
    /** @brief The total length of the metadata column. */
    tsk_size_t metadata_length;
    tsk_size_t max_metadata_length;
    tsk_size_t max_metadata_length_increment;
    /* Length of metadata_schema (presumably in bytes; confirm). */
    tsk_size_t metadata_schema_length;
    /** @brief The flags column. */
    tsk_flags_t *flags;
    /** @brief The location column. */
    double *location;
    /** @brief The location_offset column. */
    tsk_size_t *location_offset;
    /** @brief The parents column. */
    tsk_id_t *parents;
    /** @brief The parents_offset column. */
    tsk_size_t *parents_offset;
    /** @brief The metadata column. */
    char *metadata;
    /** @brief The metadata_offset column. */
    tsk_size_t *metadata_offset;
    /** @brief The metadata schema */
    char *metadata_schema;
} tsk_individual_table_t;

/**
@brief The node table.

@rst
See the node :ref:`table definition ` for
details of the columns in this table.
@endrst
*/
typedef struct {
    /** @brief The number of rows in this table. */
    tsk_size_t num_rows;
    /* NOTE(review): max_* and max_*_increment are undocumented here;
     * presumably allocated capacity and growth step. Confirm in tables.c. */
    tsk_size_t max_rows;
    tsk_size_t max_rows_increment;
    /** @brief The total length of the metadata column. */
    tsk_size_t metadata_length;
    tsk_size_t max_metadata_length;
    tsk_size_t max_metadata_length_increment;
    /* Length of metadata_schema (presumably in bytes; confirm). */
    tsk_size_t metadata_schema_length;
    /** @brief The flags column. */
    tsk_flags_t *flags;
    /** @brief The time column. */
    double *time;
    /** @brief The population column. */
    tsk_id_t *population;
    /** @brief The individual column. */
    tsk_id_t *individual;
    /** @brief The metadata column. */
    char *metadata;
    /** @brief The metadata_offset column. */
    tsk_size_t *metadata_offset;
    /** @brief The metadata schema */
    char *metadata_schema;
} tsk_node_table_t;

/**
@brief The edge table.

@rst
See the edge :ref:`table definition ` for
details of the columns in this table.
@endrst
*/
typedef struct {
    /** @brief The number of rows in this table. */
    tsk_size_t num_rows;
    /* NOTE(review): max_* and max_*_increment are undocumented here;
     * presumably allocated capacity and growth step. Confirm in tables.c. */
    tsk_size_t max_rows;
    tsk_size_t max_rows_increment;
    /** @brief The total length of the metadata column. */
    tsk_size_t metadata_length;
    tsk_size_t max_metadata_length;
    tsk_size_t max_metadata_length_increment;
    /* Length of metadata_schema (presumably in bytes; confirm). */
    tsk_size_t metadata_schema_length;
    /** @brief The left column. */
    double *left;
    /** @brief The right column. */
    double *right;
    /** @brief The parent column. */
    tsk_id_t *parent;
    /** @brief The child column. */
    tsk_id_t *child;
    /** @brief The metadata column. */
    char *metadata;
    /** @brief The metadata_offset column. */
    tsk_size_t *metadata_offset;
    /** @brief The metadata schema */
    char *metadata_schema;
    /** @brief Flags for this table */
    tsk_flags_t options;
} tsk_edge_table_t;

/**
@brief The migration table.

@rst
See the migration :ref:`table definition ` for
details of the columns in this table.
@endrst
*/
typedef struct {
    /** @brief The number of rows in this table. */
    tsk_size_t num_rows;
    /* NOTE(review): max_* and max_*_increment are undocumented here;
     * presumably allocated capacity and growth step. Confirm in tables.c. */
    tsk_size_t max_rows;
    tsk_size_t max_rows_increment;
    /** @brief The total length of the metadata column. */
    tsk_size_t metadata_length;
    tsk_size_t max_metadata_length;
    tsk_size_t max_metadata_length_increment;
    /* Length of metadata_schema (presumably in bytes; confirm). */
    tsk_size_t metadata_schema_length;
    /** @brief The source column. */
    tsk_id_t *source;
    /** @brief The dest column. */
    tsk_id_t *dest;
    /** @brief The node column. */
    tsk_id_t *node;
    /** @brief The left column. */
    double *left;
    /** @brief The right column. */
    double *right;
    /** @brief The time column. */
    double *time;
    /** @brief The metadata column. */
    char *metadata;
    /** @brief The metadata_offset column. */
    tsk_size_t *metadata_offset;
    /** @brief The metadata schema */
    char *metadata_schema;
} tsk_migration_table_t;

/**
@brief The site table.

@rst
See the site :ref:`table definition ` for
details of the columns in this table.
@endrst
*/
typedef struct {
    /** @brief The number of rows in this table. */
    tsk_size_t num_rows;
    /* NOTE(review): max_* and max_*_increment are undocumented here;
     * presumably allocated capacity and growth step. Confirm in tables.c. */
    tsk_size_t max_rows;
    tsk_size_t max_rows_increment;
    /* Undocumented here; by analogy with metadata_length, presumably the
     * total length of the ancestral_state column (confirm). */
    tsk_size_t ancestral_state_length;
    tsk_size_t max_ancestral_state_length;
    tsk_size_t max_ancestral_state_length_increment;
    /** @brief The total length of the metadata column. */
    tsk_size_t metadata_length;
    tsk_size_t max_metadata_length;
    tsk_size_t max_metadata_length_increment;
    /* Length of metadata_schema (presumably in bytes; confirm). */
    tsk_size_t metadata_schema_length;
    /** @brief The position column. */
    double *position;
    /** @brief The ancestral_state column. */
    char *ancestral_state;
    /** @brief The ancestral_state_offset column. */
    tsk_size_t *ancestral_state_offset;
    /** @brief The metadata column. */
    char *metadata;
    /** @brief The metadata_offset column. */
    tsk_size_t *metadata_offset;
    /** @brief The metadata schema */
    char *metadata_schema;
} tsk_site_table_t;

/**
@brief The mutation table.

@rst
See the mutation :ref:`table definition ` for
details of the columns in this table.
@endrst
*/
typedef struct {
    /** @brief The number of rows in this table. */
    tsk_size_t num_rows;
    /* NOTE(review): max_* and max_*_increment are undocumented here;
     * presumably allocated capacity and growth step. Confirm in tables.c. */
    tsk_size_t max_rows;
    tsk_size_t max_rows_increment;
    /* Undocumented here; by analogy with metadata_length, presumably the
     * total length of the derived_state column (confirm). */
    tsk_size_t derived_state_length;
    tsk_size_t max_derived_state_length;
    tsk_size_t max_derived_state_length_increment;
    /** @brief The total length of the metadata column. */
    tsk_size_t metadata_length;
    tsk_size_t max_metadata_length;
    tsk_size_t max_metadata_length_increment;
    /* Length of metadata_schema (presumably in bytes; confirm). */
    tsk_size_t metadata_schema_length;
    /** @brief The node column. */
    tsk_id_t *node;
    /** @brief The site column. */
    tsk_id_t *site;
    /** @brief The parent column. */
    tsk_id_t *parent;
    /** @brief The time column. */
    double *time;
    /** @brief The derived_state column. */
    char *derived_state;
    /** @brief The derived_state_offset column. */
    tsk_size_t *derived_state_offset;
    /** @brief The metadata column. */
    char *metadata;
    /** @brief The metadata_offset column. */
    tsk_size_t *metadata_offset;
    /** @brief The metadata schema */
    char *metadata_schema;
} tsk_mutation_table_t;

/**
@brief The population table.

@rst
See the population :ref:`table definition ` for
details of the columns in this table.
@endrst
*/
typedef struct {
    /** @brief The number of rows in this table. */
    tsk_size_t num_rows;
    /* NOTE(review): max_* and max_*_increment are undocumented here;
     * presumably allocated capacity and growth step. Confirm in tables.c. */
    tsk_size_t max_rows;
    tsk_size_t max_rows_increment;
    /** @brief The total length of the metadata column. */
    tsk_size_t metadata_length;
    tsk_size_t max_metadata_length;
    tsk_size_t max_metadata_length_increment;
    /* Length of metadata_schema (presumably in bytes; confirm). */
    tsk_size_t metadata_schema_length;
    /** @brief The metadata column. */
    char *metadata;
    /** @brief The metadata_offset column. */
    tsk_size_t *metadata_offset;
    /** @brief The metadata schema */
    char *metadata_schema;
} tsk_population_table_t;

/**
@brief The provenance table.

@rst
See the provenance :ref:`table definition ` for
details of the columns in this table.
@endrst
*/
typedef struct {
    /** @brief The number of rows in this table. */
    tsk_size_t num_rows;
    /* NOTE(review): max_* and max_*_increment are undocumented here;
     * presumably allocated capacity and growth step. Confirm in tables.c. */
    tsk_size_t max_rows;
    tsk_size_t max_rows_increment;
    /** @brief The total length of the timestamp column. */
    tsk_size_t timestamp_length;
    tsk_size_t max_timestamp_length;
    tsk_size_t max_timestamp_length_increment;
    /** @brief The total length of the record column. */
    tsk_size_t record_length;
    tsk_size_t max_record_length;
    tsk_size_t max_record_length_increment;
    /** @brief The timestamp column. */
    char *timestamp;
    /** @brief The timestamp_offset column. */
    tsk_size_t *timestamp_offset;
    /** @brief The record column. */
    char *record;
    /** @brief The record_offset column. */
    tsk_size_t *record_offset;
} tsk_provenance_table_t;

/* The reference sequence held by a table collection (see the
 * reference_sequence member of tsk_table_collection_t): four character
 * buffers, each paired with its length. */
typedef struct {
    char *data;
    tsk_size_t data_length;
    char *url;
    tsk_size_t url_length;
    char *metadata;
    tsk_size_t metadata_length;
    char *metadata_schema;
    tsk_size_t metadata_schema_length;
} tsk_reference_sequence_t;

/**
@brief A collection of tables defining the data for a tree sequence.
*/
typedef struct {
    /** @brief The sequence length defining the tree sequence's coordinate space */
    double sequence_length;
    /* NOTE(review): undocumented here; presumably the UUID of the file the
     * tables were loaded from. Confirm against the load/dump code. */
    char *file_uuid;
    /** @brief The units of the time dimension */
    char *time_units;
    tsk_size_t time_units_length;
    /** @brief The tree-sequence metadata */
    char *metadata;
    tsk_size_t metadata_length;
    /** @brief The metadata schema */
    char *metadata_schema;
    tsk_size_t metadata_schema_length;
    tsk_reference_sequence_t reference_sequence;
    /** @brief The individual table */
    tsk_individual_table_t individuals;
    /** @brief The node table */
    tsk_node_table_t nodes;
    /** @brief The edge table */
    tsk_edge_table_t edges;
    /** @brief The migration table */
    tsk_migration_table_t migrations;
    /** @brief The site table */
    tsk_site_table_t sites;
    /** @brief The mutation table */
    tsk_mutation_table_t mutations;
    /** @brief The population table */
    tsk_population_table_t populations;
    /** @brief The provenance table */
    tsk_provenance_table_t provenances;
    /* Table indexes. NOTE(review): presumably edge IDs listed in insertion
     * and removal order, with num_edges the number of edges indexed; confirm
     * against the index-building code. */
    struct {
        tsk_id_t *edge_insertion_order;
        tsk_id_t *edge_removal_order;
        tsk_size_t num_edges;
    } indexes;
} tsk_table_collection_t;

/**
@brief A bookmark recording the position of all the tables in a table collection.
*/
typedef struct {
    /** @brief The position in the individual table. */
    tsk_size_t individuals;
    /** @brief The position in the node table. */
    tsk_size_t nodes;
    /** @brief The position in the edge table. */
    tsk_size_t edges;
    /** @brief The position in the migration table. */
    tsk_size_t migrations;
    /** @brief The position in the site table. */
    tsk_size_t sites;
    /** @brief The position in the mutation table. */
    tsk_size_t mutations;
    /** @brief The position in the population table. */
    tsk_size_t populations;
    /** @brief The position in the provenance table. */
    tsk_size_t provenances;
} tsk_bookmark_t;

/**
@brief Low-level table sorting method.
*/
typedef struct _tsk_table_sorter_t {
    /** @brief The input tables that are being sorted. */
    tsk_table_collection_t *tables;
    /** @brief The edge sorting function. If set to NULL, edges are not sorted. */
    int (*sort_edges)(struct _tsk_table_sorter_t *self, tsk_size_t start);
    /** @brief The mutation sorting function. */
    int (*sort_mutations)(struct _tsk_table_sorter_t *self);
    /** @brief The individual sorting function. */
    int (*sort_individuals)(struct _tsk_table_sorter_t *self);
    /** @brief An opaque pointer for use by client code */
    void *user_data;
    /** @brief Mapping from input site IDs to output site IDs */
    tsk_id_t *site_id_map;
} tsk_table_sorter_t;

/* Structs for IBD finding.
 * TODO: document properly
 * */

/* Note for tskit developers: it's perhaps a bit confusing/pointless to
 * have the tsk_identity_segment_t struct as well as the internal tsk_segment_t
 * struct (which is identical). However, we may want to implement either
 * segment type differently in future, and since the tsk_identity_segment_t
 * is part of the public API we want to allow the freedom for the different
 * structures to evolve over time */
/* One segment with its [left, right) coordinates and node, linked to the next
 * segment through `next`. NOTE(review): half-open interval is assumed from
 * the coordinate convention used for edges; confirm. */
typedef struct _tsk_identity_segment_t {
    double left;
    double right;
    struct _tsk_identity_segment_t *next;
    tsk_id_t node;
} tsk_identity_segment_t;

/* A singly linked list of tsk_identity_segment_t (head/tail pointers) together
 * with a segment count and a total span. */
typedef struct {
    tsk_size_t num_segments;
    double total_span;
    tsk_identity_segment_t *head;
    tsk_identity_segment_t *tail;
} tsk_identity_segment_list_t;

/* Top-level IBD result container. NOTE(review): field roles are not shown in
 * this header; presumably pair_map is keyed by sample pair and heap owns the
 * segment storage. Confirm in tables.c. */
typedef struct {
    tsk_size_t num_nodes;
    tsk_avl_tree_int_t pair_map;
    tsk_size_t num_segments;
    double total_span;
    tsk_blkalloc_t heap;
    bool store_segments;
    bool store_pairs;
} tsk_identity_segments_t;

/* Diff iterator.
*/ typedef struct _tsk_edge_list_node_t { tsk_edge_t edge; struct _tsk_edge_list_node_t *next; struct _tsk_edge_list_node_t *prev; } tsk_edge_list_node_t; typedef struct { tsk_edge_list_node_t *head; tsk_edge_list_node_t *tail; } tsk_edge_list_t; /****************************************************************************/ /* Common function options */ /****************************************************************************/ /** @defgroup API_FLAGS_SIMPLIFY_GROUP :c:func:`tsk_table_collection_simplify` and :c:func:`tsk_treeseq_simplify` specific flags. @{ */ /** Remove sites from the output if there are no mutations that reference them.*/ #define TSK_SIMPLIFY_FILTER_SITES (1 << 0) /** Remove populations from the output if there are no nodes or migrations that reference them. */ #define TSK_SIMPLIFY_FILTER_POPULATIONS (1 << 1) /** Remove individuals from the output if there are no nodes that reference them.*/ #define TSK_SIMPLIFY_FILTER_INDIVIDUALS (1 << 2) /** Do not remove nodes from the output if there are no edges that reference them and do not reorder nodes so that the samples are nodes 0 to num_samples - 1. Note that this flag is negated compared to other filtering options because the default behaviour is to filter unreferenced nodes and reorder to put samples first. */ #define TSK_SIMPLIFY_NO_FILTER_NODES (1 << 7) /** Do not update the sample status of nodes as a result of simplification. */ #define TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS (1 << 8) /** Reduce the topological information in the tables to the minimum necessary to represent the trees that contain sites. If there are zero sites this will result in an zero output edges. When the number of sites is greater than zero, every tree in the output tree sequence will contain at least one site. For a given site, the topology of the tree containing that site will be identical (up to node ID remapping) to the topology of the corresponding tree in the input. 
*/ #define TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY (1 << 3) /** By default simplify removes unary nodes (i.e., nodes with exactly one child) along the path from samples to root. If this option is specified such unary nodes will be preserved in the output. */ #define TSK_SIMPLIFY_KEEP_UNARY (1 << 4) /** By default simplify removes all topology ancestral the MRCAs of the samples. This option inserts edges from these MRCAs back to the roots of the input trees. */ #define TSK_SIMPLIFY_KEEP_INPUT_ROOTS (1 << 5) /** @rst This acts like :c:macro:`TSK_SIMPLIFY_KEEP_UNARY` (and is mutually exclusive with that flag). It keeps unary nodes, but only if the unary node is referenced from an individual. @endrst */ #define TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS (1 << 6) /** @} */ /** @defgroup API_FLAGS_SUBSET_GROUP :c:func:`tsk_table_collection_subset` specific flags. @{ */ /**If this flag is provided, the population table will not be changed in any way.*/ #define TSK_SUBSET_NO_CHANGE_POPULATIONS (1 << 0) /** @rst If this flag is provided, then unreferenced sites, individuals, and populations will not be removed. If so, the site and individual tables will not be changed, and (unless :c:macro:`TSK_SUBSET_NO_CHANGE_POPULATIONS` is also provided) unreferenced populations will be placed last, in their original order. @endrst */ #define TSK_SUBSET_KEEP_UNREFERENCED (1 << 1) /** @} */ /** @defgroup API_FLAGS_CHECK_INTEGRITY_GROUP :c:func:`tsk_table_collection_check_integrity` specific flags. @{ */ /** Check edge ordering constraints for a tree sequence. */ #define TSK_CHECK_EDGE_ORDERING (1 << 0) /** Check that sites are in non-decreasing position order. */ #define TSK_CHECK_SITE_ORDERING (1 << 1) /**Check for any duplicate site positions. */ #define TSK_CHECK_SITE_DUPLICATES (1 << 2) /** Check constraints on the ordering of mutations. Any non-null mutation parents and known times are checked for ordering constraints. 
*/ #define TSK_CHECK_MUTATION_ORDERING (1 << 3) /**Check individual parents are before children, where specified. */ #define TSK_CHECK_INDIVIDUAL_ORDERING (1 << 4) /**Check migrations are ordered by time. */ #define TSK_CHECK_MIGRATION_ORDERING (1 << 5) /**Check that the table indexes exist, and contain valid edge references. */ #define TSK_CHECK_INDEXES (1 << 6) /** All checks needed to define a valid tree sequence. Note that this implies all of the above checks. */ #define TSK_CHECK_TREES (1 << 7) /** Check mutation parents are consistent with topology. Implies TSK_CHECK_TREES. */ #define TSK_CHECK_MUTATION_PARENTS (1 << 8) /* Leave room for more positive check flags */ /** Do not check integrity of references to populations. This can be safely combined with the other checks. */ #define TSK_NO_CHECK_POPULATION_REFS (1 << 12) /** @} */ /** @defgroup API_FLAGS_LOAD_INIT_GROUP Flags used by load and init methods. @{ */ /* These flags are for table collection load or init, or used as flags on table collection or individual tables. * As flags are passed though from load to init they share a namespace */ /** Skip reading tables, and only load top-level information. */ #define TSK_LOAD_SKIP_TABLES (1 << 0) /** Do not load reference sequence. */ #define TSK_LOAD_SKIP_REFERENCE_SEQUENCE (1 << 1) /** @rst Do not allocate space to store metadata in this table. Operations attempting to add non-empty metadata to the table will fail with error TSK_ERR_METADATA_DISABLED. @endrst */ #define TSK_TABLE_NO_METADATA (1 << 2) /** @rst Do not allocate space to store metadata in the edge table. Operations attempting to add non-empty metadata to the edge table will fail with error TSK_ERR_METADATA_DISABLED. 
@endrst
*/
#define TSK_TC_NO_EDGE_METADATA (1 << 3)
/** @} */

/* Flags for dump tables */
/* We may not want to document this flag, but it's useful for testing
 * so we put it high up in the bit space, below the common options */
#define TSK_DUMP_FORCE_OFFSET_64 (1 << 27)

/**
@defgroup API_FLAGS_COPY_GROUP Flags used by :c:func:`tsk_table_collection_copy`.
@{
*/
/** Copy the file uuid; by default this is not copied. */
#define TSK_COPY_FILE_UUID (1 << 0)
/** @} */

/**
@defgroup API_FLAGS_UNION_GROUP Flags used by :c:func:`tsk_table_collection_union`.
@{
*/
/**
By default, union checks that the portion of shared history between
``self`` and ``other``, as implied by ``other_node_mapping``, are indeed
equivalent. It does so by subsetting both ``self`` and ``other`` on the
equivalent nodes specified in ``other_node_mapping``, and then checking for
equality of the subsets. If this option is specified, this check is not
performed.
*/
#define TSK_UNION_NO_CHECK_SHARED (1 << 0)
/**
By default, all nodes new to ``self`` are assigned new populations. If this
option is specified, nodes that are added to ``self`` will retain the
population IDs they have in ``other``.
*/
#define TSK_UNION_NO_ADD_POP (1 << 1)
/**
By default, union only adds edges adjacent to a newly added node;
this option adds all edges.
*/
#define TSK_UNION_ALL_EDGES (1 << 2)
/**
By default, union only adds mutations on newly added edges, and
sites for those mutations; this option adds all mutations and all sites.
*/
#define TSK_UNION_ALL_MUTATIONS (1 << 3)
/** @} */

/**
@defgroup API_FLAGS_CMP_GROUP Flags used by :c:func:`tsk_table_collection_equals`.
@{
*/
/**
Do not include the top-level tree sequence metadata and metadata schemas
in the comparison.
*/
#define TSK_CMP_IGNORE_TS_METADATA (1 << 0)
/** Do not include the provenance table in the comparison. */
#define TSK_CMP_IGNORE_PROVENANCE (1 << 1)
/**
@rst
Do not include metadata when comparing the table collections.
This includes both the top-level tree sequence metadata as well as the
metadata for each of the tables (i.e., :c:macro:`TSK_CMP_IGNORE_TS_METADATA` is implied).
All metadata schemas are also ignored.
@endrst
*/
#define TSK_CMP_IGNORE_METADATA (1 << 2)
/**
@rst
Do not include the timestamp information when comparing the provenance
tables. This has no effect if :c:macro:`TSK_CMP_IGNORE_PROVENANCE` is specified.
@endrst
*/
#define TSK_CMP_IGNORE_TIMESTAMPS (1 << 3)
/**
Do not include any tables in the comparison, thus comparing only the
top-level information of the table collections being compared.
*/
#define TSK_CMP_IGNORE_TABLES (1 << 4)
/** Do not include the reference sequence in the comparison. */
#define TSK_CMP_IGNORE_REFERENCE_SEQUENCE (1 << 5)
/** @} */

/**
@defgroup API_FLAGS_CLEAR_GROUP Flags used by :c:func:`tsk_table_collection_clear`.
@{
*/
/** Additionally clear the table metadata schemas. */
#define TSK_CLEAR_METADATA_SCHEMAS (1 << 0)
/** Additionally clear the tree-sequence metadata and schema. */
#define TSK_CLEAR_TS_METADATA_AND_SCHEMA (1 << 1)
/** Additionally clear the provenance table. */
#define TSK_CLEAR_PROVENANCE (1 << 2)
/** @} */

/* For the edge diff iterator */
#define TSK_INCLUDE_TERMINAL (1 << 0)

/**
@brief Value returned by seeking methods when they have successfully
    seeked to a non-null tree.

@ingroup TREE_API_SEEKING_GROUP
*/
#define TSK_TREE_OK 1

/****************************************************************************/
/* Function signatures                                                      */
/****************************************************************************/

/**
@defgroup INDIVIDUAL_TABLE_API_GROUP Individual table API.
@{
*/

/**
@brief Initialises the table by allocating the internal memory.

@rst
This must be called before any operations are performed on the table.
See the :ref:`sec_c_api_overview_structure` for details on how objects
are initialised and freed.
@endrst

@param self A pointer to an uninitialised tsk_individual_table_t object.
@param options Allocation time options. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_init(tsk_individual_table_t *self, tsk_flags_t options);

/**
@brief Free the internal memory for the specified table.

@param self A pointer to an initialised tsk_individual_table_t object.
@return Always returns 0.
*/
int tsk_individual_table_free(tsk_individual_table_t *self);

/**
@brief Adds a row to this individual table.

@rst
Add a new individual with the specified ``flags``, ``location``, ``parents`` and
``metadata`` to the table. Copies of the ``location``, ``parents`` and ``metadata``
parameters are taken immediately. See the :ref:`table definition
<sec_individual_table_definition>` for details of the columns in this table.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param flags The bitwise flags for the new individual.
@param location A pointer to a double array representing the spatial location
    of the new individual. Can be ``NULL`` if ``location_length`` is 0.
@param location_length The number of dimensions in the location's position.
    Note this is the number of elements in the corresponding double array,
    not the number of bytes.
@param parents A pointer to a ``tsk_id_t`` array representing the parents
    of the new individual. Can be ``NULL`` if ``parents_length`` is 0.
@param parents_length The number of parents.
    Note this is the number of elements in the corresponding ``tsk_id_t`` array,
    not the number of bytes.
@param metadata The metadata to be associated with the new individual. This
    is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0.
@param metadata_length The size of the metadata array in bytes.
@return Return the ID of the newly added individual on success,
    or a negative value on failure.
*/
tsk_id_t tsk_individual_table_add_row(tsk_individual_table_t *self, tsk_flags_t flags,
    const double *location, tsk_size_t location_length, const tsk_id_t *parents,
    tsk_size_t parents_length, const char *metadata, tsk_size_t metadata_length);

/**
@brief Updates the row at the specified index.

@rst
Rewrite the row at the specified index in this table to use the specified
values. Copies of the ``location``, ``parents`` and ``metadata``
parameters are taken immediately. See the :ref:`table definition
<sec_individual_table_definition>` for details of the columns in this table.

.. warning::
    Because of the way that ragged columns are encoded, this method requires a
    full rewrite of the internal column memory in worst case, and would
    therefore be inefficient for bulk updates for such columns. However, if the
    sizes of all ragged column values are unchanged in the updated row, this
    method is guaranteed to only update the memory for the row in question.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param index The row to update.
@param flags The bitwise flags for the individual.
@param location A pointer to a double array representing the spatial location
    of the individual. Can be ``NULL`` if ``location_length`` is 0.
@param location_length The number of dimensions in the location's position.
    Note this is the number of elements in the corresponding double array,
    not the number of bytes.
@param parents A pointer to a ``tsk_id_t`` array representing the parents
    of the individual. Can be ``NULL`` if ``parents_length`` is 0.
@param parents_length The number of parents.
    Note this is the number of elements in the corresponding ``tsk_id_t`` array,
    not the number of bytes.
@param metadata The metadata to be associated with the individual. This
    is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0.
@param metadata_length The size of the metadata array in bytes.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_update_row(tsk_individual_table_t *self, tsk_id_t index,
    tsk_flags_t flags, const double *location, tsk_size_t location_length,
    const tsk_id_t *parents, tsk_size_t parents_length, const char *metadata,
    tsk_size_t metadata_length);

/**
@brief Clears this table, setting the number of rows to zero.

@rst
No memory is freed as a result of this operation; please use
:c:func:`tsk_individual_table_free` to free the table's internal resources. Note that
the metadata schema is not cleared.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_clear(tsk_individual_table_t *self);

/**
@brief Truncates this table so that only the first ``num_rows`` rows are retained.

@param self A pointer to a tsk_individual_table_t object.
@param num_rows The number of rows to retain in the table.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_truncate(tsk_individual_table_t *self, tsk_size_t num_rows);

/**
@brief Extends this table by appending rows copied from another table.

@rst
Appends the rows at the specified indexes from the table ``other`` to the end of this
table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append
the first ``num_rows`` rows from ``other`` to this table. Note that metadata is copied
as-is and is not checked for compatibility with any existing schema on this table.
@endrst

@param self A pointer to a tsk_individual_table_t object where rows are to be added.
@param other A pointer to a tsk_individual_table_t object where rows are copied from.
@param num_rows The number of rows from ``other`` to append to this table.
@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the
    first ``num_rows`` rows of ``other`` are used.
@param options Bitwise option flags. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_extend(tsk_individual_table_t *self,
    const tsk_individual_table_t *other, tsk_size_t num_rows,
    const tsk_id_t *row_indexes, tsk_flags_t options);

/**
@brief Subset this table by keeping rows according to a boolean mask.

@rst
Deletes rows from this table and optionally returns the mapping from IDs in
the current table to the updated table. Rows are kept or deleted according to
the specified boolean array ``keep`` such that for each row ``j`` if
``keep[j]`` is false (zero) the row is deleted, and otherwise the row is
retained. Thus, ``keep`` must be an array of at least ``num_rows``
:c:type:`bool` values.

If the ``id_map`` argument is non-null, this array will be updated to represent
the mapping between IDs before and after row deletion. For row ``j``,
``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or
:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an
array of at least ``num_rows`` :c:type:`tsk_id_t` values.

The values in the ``parents`` column are updated according to this map, so that
reference integrity within the table is maintained. As a consequence of this,
the values in the ``parents`` column for kept rows are bounds-checked and an
error raised if they are not valid. Rows that are deleted are not checked for
parent ID integrity.

If an attempt is made to delete rows that are referred to by the ``parents``
column of rows that are retained, an error is raised.

These error conditions are checked before any alterations to the table are
made.

.. warning::
    C++ users need to be careful to specify the correct type when
    passing in values for the ``keep`` array,
    using ``std::vector<tsk_bool_t>`` and not ``std::vector<bool>``,
    as the latter may not be the correct size.

@endrst

@param self A pointer to a tsk_individual_table_t object.
@param keep Array of boolean flags describing whether a particular
    row should be kept or not.
    Must be at least ``num_rows`` long.
@param options Bitwise option flags. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@param id_map An array in which to store the mapping from old IDs to
    new IDs. If NULL, this will be ignored.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_keep_rows(tsk_individual_table_t *self, const tsk_bool_t *keep,
    tsk_flags_t options, tsk_id_t *id_map);

/**
@brief Returns true if the data in the specified table is identical to the data
       in this table.

@rst

**Options**

Options to control the comparison can be specified by providing one or
more of the following bitwise flags. By default (options=0) tables are
considered equal if they are byte-wise identical in all columns,
and their metadata schemas are byte-wise identical.

- :c:macro:`TSK_CMP_IGNORE_METADATA`
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param other A pointer to a tsk_individual_table_t object.
@param options Bitwise comparison options.
@return Return true if the specified table is equal to this table.
*/
bool tsk_individual_table_equals(const tsk_individual_table_t *self,
    const tsk_individual_table_t *other, tsk_flags_t options);

/**
@brief Copies the state of this table into the specified destination.

@rst
By default the method initialises the specified destination table. If the
destination is already initialised, the :c:macro:`TSK_NO_INIT` option should
be supplied to avoid leaking memory.

Indexes that are present are also copied to the destination table.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param dest A pointer to a tsk_individual_table_t object. If the TSK_NO_INIT
    option is specified, this must be an initialised individual table. If not,
    it must be an uninitialised individual table.
@param options Bitwise option flags.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_copy(const tsk_individual_table_t *self,
    tsk_individual_table_t *dest, tsk_flags_t options);

/**
@brief Get the row at the specified index.

@rst
Updates the specified individual struct to reflect the values in the specified row.
Pointers to memory within this struct are handled by the table and should **not**
be freed by client code. These pointers are guaranteed to be valid until the
next operation that modifies the table (e.g., by adding a new row), but not afterwards.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param index The requested table row.
@param row A pointer to a tsk_individual_t struct that is updated to reflect the
    values in the specified row.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_get_row(
    const tsk_individual_table_t *self, tsk_id_t index, tsk_individual_t *row);

/**
@brief Set the metadata schema.

@rst
Copies the metadata schema string to this table, replacing any existing schema.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param metadata_schema A pointer to a char array.
@param metadata_schema_length The size of the metadata schema in bytes.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_set_metadata_schema(tsk_individual_table_t *self,
    const char *metadata_schema, tsk_size_t metadata_schema_length);

/**
@brief Print out the state of this table to the specified stream.

This method is intended for debugging purposes and should not be used
in production code. The format of the output should **not** be depended
on and may change arbitrarily between versions.

@param self A pointer to a tsk_individual_table_t object.
@param out The stream to write the summary to.
*/
void tsk_individual_table_print_state(const tsk_individual_table_t *self, FILE *out);

/**
@brief Replace this table's data by copying from a set of column arrays.

@rst
Clears the data columns of this table and then copies column data from the specified
set of arrays. The supplied arrays should all contain data on the same number of rows.
The metadata schema is not affected.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param num_rows The number of rows to copy from the specified arrays.
@param flags The array of tsk_flags_t flag values to be copied.
@param location The array of double location values to be copied.
@param location_offset The array of tsk_size_t location offset values to be copied.
@param parents The array of tsk_id_t parent values to be copied.
@param parents_offset The array of tsk_size_t parent offset values to be copied.
@param metadata The array of char metadata values to be copied.
@param metadata_offset The array of tsk_size_t metadata offset values to be copied.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_set_columns(tsk_individual_table_t *self, tsk_size_t num_rows,
    const tsk_flags_t *flags, const double *location, const tsk_size_t *location_offset,
    const tsk_id_t *parents, const tsk_size_t *parents_offset, const char *metadata,
    const tsk_size_t *metadata_offset);

/**
@brief Extends this table by copying from a set of column arrays.

@rst
Copies column data from the specified set of arrays to create new rows at the end of the
table. The supplied arrays should all contain data on the same number of rows. The
metadata schema is not affected.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param num_rows The number of rows to copy from the specified arrays.
@param flags The array of tsk_flags_t flag values to be copied.
@param location The array of double location values to be copied.
@param location_offset The array of tsk_size_t location offset values to be copied.
@param parents The array of tsk_id_t parent values to be copied.
@param parents_offset The array of tsk_size_t parent offset values to be copied.
@param metadata The array of char metadata values to be copied.
@param metadata_offset The array of tsk_size_t metadata offset values to be copied.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_append_columns(tsk_individual_table_t *self,
    tsk_size_t num_rows, const tsk_flags_t *flags, const double *location,
    const tsk_size_t *location_offset, const tsk_id_t *parents,
    const tsk_size_t *parents_offset, const char *metadata,
    const tsk_size_t *metadata_offset);

/**
@brief Controls the pre-allocation strategy for this table.

@rst
Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default
pre-allocation strategy.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param max_rows_increment The number of rows to pre-allocate, or zero for the default
    doubling strategy.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_set_max_rows_increment(
    tsk_individual_table_t *self, tsk_size_t max_rows_increment);

/**
@brief Controls the pre-allocation strategy for the metadata column.

@rst
Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default
pre-allocation strategy.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for
    the default doubling strategy.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_set_max_metadata_length_increment(
    tsk_individual_table_t *self, tsk_size_t max_metadata_length_increment);

/**
@brief Controls the pre-allocation strategy for the location column.

@rst
Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default
pre-allocation strategy.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param max_location_length_increment The number of bytes to pre-allocate, or zero for
    the default doubling strategy.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_set_max_location_length_increment(
    tsk_individual_table_t *self, tsk_size_t max_location_length_increment);

/**
@brief Controls the pre-allocation strategy for the parents column.

@rst
Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default
pre-allocation strategy.
@endrst

@param self A pointer to a tsk_individual_table_t object.
@param max_parents_length_increment The number of bytes to pre-allocate, or zero for
    the default doubling strategy.
@return Return 0 on success or a negative value on failure.
*/
int tsk_individual_table_set_max_parents_length_increment(
    tsk_individual_table_t *self, tsk_size_t max_parents_length_increment);

/** @} */

/* Undocumented methods */

int tsk_individual_table_dump_text(const tsk_individual_table_t *self, FILE *out);

/**
@defgroup NODE_TABLE_API_GROUP Node table API.
@{
*/

/**
@brief Initialises the table by allocating the internal memory.

@rst
This must be called before any operations are performed on the table.
See the :ref:`sec_c_api_overview_structure` for details on how objects
are initialised and freed.
@endrst

@param self A pointer to an uninitialised tsk_node_table_t object.
@param options Allocation time options.
    Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_init(tsk_node_table_t *self, tsk_flags_t options);

/**
@brief Free the internal memory for the specified table.

@param self A pointer to an initialised tsk_node_table_t object.
@return Always returns 0.
*/
int tsk_node_table_free(tsk_node_table_t *self);

/**
@brief Adds a row to this node table.

@rst
Add a new node with the specified ``flags``, ``time``, ``population``,
``individual`` and ``metadata`` to the table. A copy of the ``metadata`` parameter
is taken immediately. See the :ref:`table definition <sec_node_table_definition>`
for details of the columns in this table.
@endrst

@param self A pointer to a tsk_node_table_t object.
@param flags The bitwise flags for the new node.
@param time The time for the new node.
@param population The population for the new node. Set to TSK_NULL if not
    known.
@param individual The individual for the new node. Set to TSK_NULL if not
    known.
@param metadata The metadata to be associated with the new node. This
    is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0.
@param metadata_length The size of the metadata array in bytes.
@return Return the ID of the newly added node on success,
    or a negative value on failure.
*/
tsk_id_t tsk_node_table_add_row(tsk_node_table_t *self, tsk_flags_t flags, double time,
    tsk_id_t population, tsk_id_t individual, const char *metadata,
    tsk_size_t metadata_length);

/**
@brief Updates the row at the specified index.

@rst
Rewrite the row at the specified index in this table to use the specified
values. A copy of the ``metadata`` parameter is taken immediately. See the
:ref:`table definition <sec_node_table_definition>` for details of the columns
in this table.

.. warning::
    Because of the way that ragged columns are encoded, this method requires a
    full rewrite of the internal column memory in worst case, and would
    therefore be inefficient for bulk updates for such columns. However, if the
    sizes of all ragged column values are unchanged in the updated row, this
    method is guaranteed to only update the memory for the row in question.
@endrst

@param self A pointer to a tsk_node_table_t object.
@param index The row to update.
@param flags The bitwise flags for the node.
@param time The time for the node.
@param population The population for the node. Set to TSK_NULL if not known.
@param individual The individual for the node. Set to TSK_NULL if not known.
@param metadata The metadata to be associated with the node. This
    is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0.
@param metadata_length The size of the metadata array in bytes.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_update_row(tsk_node_table_t *self, tsk_id_t index, tsk_flags_t flags,
    double time, tsk_id_t population, tsk_id_t individual, const char *metadata,
    tsk_size_t metadata_length);

/**
@brief Clears this table, setting the number of rows to zero.

@rst
No memory is freed as a result of this operation; please use
:c:func:`tsk_node_table_free` to free the table's internal resources. Note that the
metadata schema is not cleared.
@endrst

@param self A pointer to a tsk_node_table_t object.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_clear(tsk_node_table_t *self);

/**
@brief Truncates this table so that only the first ``num_rows`` rows are retained.

@param self A pointer to a tsk_node_table_t object.
@param num_rows The number of rows to retain in the table.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_truncate(tsk_node_table_t *self, tsk_size_t num_rows);

/**
@brief Extends this table by appending rows copied from another table.

@rst
Appends the rows at the specified indexes from the table ``other`` to the end of this
table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append
the first ``num_rows`` rows from ``other`` to this table. Note that metadata is copied
as-is and is not checked for compatibility with any existing schema on this table.
@endrst

@param self A pointer to a tsk_node_table_t object where rows are to be added.
@param other A pointer to a tsk_node_table_t object where rows are copied from.
@param num_rows The number of rows from ``other`` to append to this table.
@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the
    first ``num_rows`` rows of ``other`` are used.
@param options Bitwise option flags. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_extend(tsk_node_table_t *self, const tsk_node_table_t *other,
    tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options);

/**
@brief Subset this table by keeping rows according to a boolean mask.

@rst
Deletes rows from this table and optionally returns the mapping from IDs in
the current table to the updated table. Rows are kept or deleted according to
the specified boolean array ``keep`` such that for each row ``j`` if
``keep[j]`` is false (zero) the row is deleted, and otherwise the row is
retained. Thus, ``keep`` must be an array of at least ``num_rows``
:c:type:`bool` values.

If the ``id_map`` argument is non-null, this array will be updated to represent
the mapping between IDs before and after row deletion. For row ``j``,
``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or
:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an
array of at least ``num_rows`` :c:type:`tsk_id_t` values.

.. warning::
    C++ users need to be careful to specify the correct type when
    passing in values for the ``keep`` array,
    using ``std::vector<tsk_bool_t>`` and not ``std::vector<bool>``,
    as the latter may not be the correct size.

@endrst

@param self A pointer to a tsk_node_table_t object.
@param keep Array of boolean flags describing whether a particular
    row should be kept or not. Must be at least ``num_rows`` long.
@param options Bitwise option flags. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@param id_map An array in which to store the mapping from old IDs to
    new IDs. If NULL, this will be ignored.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_keep_rows(tsk_node_table_t *self, const tsk_bool_t *keep,
    tsk_flags_t options, tsk_id_t *id_map);

/**
@brief Returns true if the data in the specified table is identical to the data
       in this table.

@rst

**Options**

Options to control the comparison can be specified by providing one or
more of the following bitwise flags. By default (options=0) tables are
considered equal if they are byte-wise identical in all columns,
and their metadata schemas are byte-wise identical.

- :c:macro:`TSK_CMP_IGNORE_METADATA`
@endrst

@param self A pointer to a tsk_node_table_t object.
@param other A pointer to a tsk_node_table_t object.
@param options Bitwise comparison options.
@return Return true if the specified table is equal to this table.
*/
bool tsk_node_table_equals(
    const tsk_node_table_t *self, const tsk_node_table_t *other, tsk_flags_t options);

/**
@brief Copies the state of this table into the specified destination.

@rst
By default the method initialises the specified destination table. If the
destination is already initialised, the TSK_NO_INIT option should
be supplied to avoid leaking memory.
@endrst

@param self A pointer to a tsk_node_table_t object.
@param dest A pointer to a tsk_node_table_t object.
    If the TSK_NO_INIT option
    is specified, this must be an initialised node table. If not, it must
    be an uninitialised node table.
@param options Bitwise option flags.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_copy(
    const tsk_node_table_t *self, tsk_node_table_t *dest, tsk_flags_t options);

/**
@brief Get the row at the specified index.

@rst
Updates the specified node struct to reflect the values in the specified row.
Pointers to memory within this struct are handled by the table and should **not**
be freed by client code. These pointers are guaranteed to be valid until the
next operation that modifies the table (e.g., by adding a new row), but not afterwards.
@endrst

@param self A pointer to a tsk_node_table_t object.
@param index The requested table row.
@param row A pointer to a tsk_node_t struct that is updated to reflect the
    values in the specified row.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_get_row(
    const tsk_node_table_t *self, tsk_id_t index, tsk_node_t *row);

/**
@brief Set the metadata schema.

@rst
Copies the metadata schema string to this table, replacing any existing schema.
@endrst

@param self A pointer to a tsk_node_table_t object.
@param metadata_schema A pointer to a char array.
@param metadata_schema_length The size of the metadata schema in bytes.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_set_metadata_schema(tsk_node_table_t *self,
    const char *metadata_schema, tsk_size_t metadata_schema_length);

/**
@brief Print out the state of this table to the specified stream.

This method is intended for debugging purposes and should not be used
in production code. The format of the output should **not** be depended
on and may change arbitrarily between versions.

@param self A pointer to a tsk_node_table_t object.
@param out The stream to write the summary to.
*/
void tsk_node_table_print_state(const tsk_node_table_t *self, FILE *out);

/**
@brief Replace this table's data by copying from a set of column arrays.

@rst
Clears the data columns of this table and then copies column data from the specified
set of arrays. The supplied arrays should all contain data on the same number of rows.
The metadata schema is not affected.
@endrst

@param self A pointer to a tsk_node_table_t object.
@param num_rows The number of rows to copy from the specified arrays.
@param flags The array of tsk_flags_t values to be copied.
@param time The array of double time values to be copied.
@param population The array of tsk_id_t population values to be copied.
@param individual The array of tsk_id_t individual values to be copied.
@param metadata The array of char metadata values to be copied.
@param metadata_offset The array of tsk_size_t metadata offset values to be copied.
@return Return 0 on success or a negative value on failure.
*/
int tsk_node_table_set_columns(tsk_node_table_t *self, tsk_size_t num_rows,
    const tsk_flags_t *flags, const double *time, const tsk_id_t *population,
    const tsk_id_t *individual, const char *metadata,
    const tsk_size_t *metadata_offset);

/**
@brief Extends this table by copying from a set of column arrays.

@rst
Copies column data from the specified set of arrays to create new rows at the end of the
table. The supplied arrays should all contain data on the same number of rows. The
metadata schema is not affected.
@endrst

@param self A pointer to a tsk_node_table_t object.
@param num_rows The number of rows to copy from the specified arrays.
@param flags The array of tsk_flags_t values to be copied.
@param time The array of double time values to be copied.
@param population The array of tsk_id_t population values to be copied.
@param individual The array of tsk_id_t individual values to be copied.
@param metadata The array of char metadata values to be copied.
@param metadata_offset The array of tsk_size_t metadata offset values to be copied. @return Return 0 on success or a negative value on failure. */ int tsk_node_table_append_columns(tsk_node_table_t *self, tsk_size_t num_rows, const tsk_flags_t *flags, const double *time, const tsk_id_t *population, const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_offset); /** @brief Controls the pre-allocation strategy for this table @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy, @endrst @param self A pointer to a tsk_node_table_t object. @param max_rows_increment The number of rows to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_node_table_set_max_rows_increment( tsk_node_table_t *self, tsk_size_t max_rows_increment); /** @brief Controls the pre-allocation strategy for the metadata column @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy, @endrst @param self A pointer to a tsk_node_table_t object. @param max_metadata_length_increment The number of bytes to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_node_table_set_max_metadata_length_increment( tsk_node_table_t *self, tsk_size_t max_metadata_length_increment); /** @} */ /* Undocumented methods */ int tsk_node_table_dump_text(const tsk_node_table_t *self, FILE *out); /** @defgroup EDGE_TABLE_API_GROUP Edge table API. @{ */ /** @brief Initialises the table by allocating the internal memory. @rst This must be called before any operations are performed on the table. See the :ref:`sec_c_api_overview_structure` for details on how objects are initialised and freed. 
**Options**

Options can be specified by providing one or more of the following bitwise
flags:

- :c:macro:`TSK_TABLE_NO_METADATA`
@endrst

@param self A pointer to an uninitialised tsk_edge_table_t object.
@param options Allocation time options.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_init(tsk_edge_table_t *self, tsk_flags_t options);

/**
@brief Free the internal memory for the specified table.

@param self A pointer to an initialised tsk_edge_table_t object.
@return Always returns 0.
*/
int tsk_edge_table_free(tsk_edge_table_t *self);

/**
@brief Adds a row to this edge table.

@rst
Add a new edge with the specified ``left``, ``right``, ``parent``, ``child`` and
``metadata`` to the table. See the :ref:`table definition <sec_edge_table_definition>`
for details of the columns in this table.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param left The left coordinate for the new edge.
@param right The right coordinate for the new edge.
@param parent The parent node for the new edge.
@param child The child node for the new edge.
@param metadata The metadata to be associated with the new edge. This
    is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0.
@param metadata_length The size of the metadata array in bytes.
@return Return the ID of the newly added edge on success,
    or a negative value on failure.
*/
tsk_id_t tsk_edge_table_add_row(tsk_edge_table_t *self, double left, double right,
    tsk_id_t parent, tsk_id_t child, const char *metadata, tsk_size_t metadata_length);

/**
@brief Updates the row at the specified index.

@rst
Rewrite the row at the specified index in this table to use the specified
values. A copy of the ``metadata`` parameter is taken immediately. See the
:ref:`table definition <sec_edge_table_definition>` for details of the columns
in this table.

.. warning::
    Because of the way that ragged columns are encoded, this method requires a
    full rewrite of the internal column memory in the worst case, and would
    therefore be inefficient for bulk updates for such columns. However, if the
    sizes of all ragged column values are unchanged in the updated row, this
    method is guaranteed to only update the memory for the row in question.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param index The row to update.
@param left The left coordinate for the edge.
@param right The right coordinate for the edge.
@param parent The parent node for the edge.
@param child The child node for the edge.
@param metadata The metadata to be associated with the edge. This
    is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0.
@param metadata_length The size of the metadata array in bytes.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_update_row(tsk_edge_table_t *self, tsk_id_t index, double left,
    double right, tsk_id_t parent, tsk_id_t child, const char *metadata,
    tsk_size_t metadata_length);

/**
@brief Clears this table, setting the number of rows to zero.

@rst
No memory is freed as a result of this operation; please use
:c:func:`tsk_edge_table_free` to free the table's internal resources. Note that the
metadata schema is not cleared.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_clear(tsk_edge_table_t *self);

/**
@brief Truncates this table so that only the first num_rows are retained.

@param self A pointer to a tsk_edge_table_t object.
@param num_rows The number of rows to retain in the table.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_truncate(tsk_edge_table_t *self, tsk_size_t num_rows);

/**
@brief Extends this table by appending rows copied from another table.

@rst
Appends the rows at the specified indexes from the table ``other`` to the end of this
table. Row indexes can be repeated and in any order. If ``row_indexes`` is ``NULL``,
append the first ``num_rows`` from ``other`` to this table. Note that metadata is copied
as-is and is not checked for compatibility with any existing schema on this table.
@endrst

@param self A pointer to a tsk_edge_table_t object where rows are to be added.
@param other A pointer to a tsk_edge_table_t object where rows are copied from.
@param num_rows The number of rows from ``other`` to append to this table.
@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the
    first ``num_rows`` of ``other`` are used.
@param options Bitwise option flags. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_extend(tsk_edge_table_t *self, const tsk_edge_table_t *other,
    tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options);

/**
@brief Subset this table by keeping rows according to a boolean mask.

@rst
Deletes rows from this table and optionally returns the mapping from IDs in
the current table to the updated table. Rows are kept or deleted according to
the specified boolean array ``keep`` such that for each row ``j``
if ``keep[j]`` is false (zero) the row is deleted, and otherwise the row is
retained. Thus, ``keep`` must be an array of at least ``num_rows``
:c:type:`bool` values.

If the ``id_map`` argument is non-null, this array will be updated to represent
the mapping between IDs before and after row deletion. For row ``j``,
``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or
:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an array
of at least ``num_rows`` :c:type:`tsk_id_t` values.

.. warning::
    C++ users need to be careful to specify the correct type when
    passing in values for the ``keep`` array,
    using ``std::vector<tsk_bool_t>`` and not ``std::vector<bool>``,
    as the latter may not be the correct size.

@endrst

@param self A pointer to a tsk_edge_table_t object.
@param keep Array of boolean flags describing whether a particular
    row should be kept or not. Must be at least ``num_rows`` long.
@param options Bitwise option flags. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@param id_map An array in which to store the mapping between new
    and old IDs. If NULL, this will be ignored.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_keep_rows(tsk_edge_table_t *self, const tsk_bool_t *keep,
    tsk_flags_t options, tsk_id_t *id_map);

/**
@brief Returns true if the data in the specified table is identical to the data
       in this table.

@rst

**Options**

Options to control the comparison can be specified by providing one or
more of the following bitwise flags. By default (options=0) tables are
considered equal if they are byte-wise identical in all columns,
and their metadata schemas are byte-wise identical.

- :c:macro:`TSK_CMP_IGNORE_METADATA`
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param other A pointer to a tsk_edge_table_t object.
@param options Bitwise comparison options.
@return Return true if the specified table is equal to this table.
*/
bool tsk_edge_table_equals(
    const tsk_edge_table_t *self, const tsk_edge_table_t *other, tsk_flags_t options);

/**
@brief Copies the state of this table into the specified destination.

@rst
By default the method initialises the specified destination table. If the
destination is already initialised, the :c:macro:`TSK_NO_INIT` option should
be supplied to avoid leaking memory.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param dest A pointer to a tsk_edge_table_t object.
If the TSK_NO_INIT option is specified, this must be an initialised edge table.
    If not, it must be an uninitialised edge table.
@param options Bitwise option flags.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_copy(
    const tsk_edge_table_t *self, tsk_edge_table_t *dest, tsk_flags_t options);

/**
@brief Get the row at the specified index.

@rst
Updates the specified edge struct to reflect the values in the specified row.
Pointers to memory within this struct are handled by the table and should **not**
be freed by client code. These pointers are guaranteed to be valid until the
next operation that modifies the table (e.g., by adding a new row), but not afterwards.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param index The requested table row.
@param row A pointer to a tsk_edge_t struct that is updated to reflect the
    values in the specified row.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_get_row(
    const tsk_edge_table_t *self, tsk_id_t index, tsk_edge_t *row);

/**
@brief Set the metadata schema.

@rst
Copies the metadata schema string to this table, replacing any existing schema.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param metadata_schema A pointer to a char array.
@param metadata_schema_length The size of the metadata schema in bytes.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_set_metadata_schema(tsk_edge_table_t *self,
    const char *metadata_schema, tsk_size_t metadata_schema_length);

/**
@brief Print out the state of this table to the specified stream.

This method is intended for debugging purposes and should not be used
in production code. The format of the output should **not** be depended
on and may change arbitrarily between versions.

@param self A pointer to a tsk_edge_table_t object.
@param out The stream to write the summary to.
*/
void tsk_edge_table_print_state(const tsk_edge_table_t *self, FILE *out);

/**
@brief Replace this table's data by copying from a set of column arrays.

@rst
Clears the data columns of this table and then copies column data from the specified
set of arrays. The supplied arrays should all contain data on the same number of rows.
The metadata schema is not affected.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param num_rows The number of rows to copy from the specified arrays.
@param left The array of double left values to be copied.
@param right The array of double right values to be copied.
@param parent The array of tsk_id_t parent values to be copied.
@param child The array of tsk_id_t child values to be copied.
@param metadata The array of char metadata values to be copied.
@param metadata_offset The array of tsk_size_t metadata offset values to be copied.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_set_columns(tsk_edge_table_t *self, tsk_size_t num_rows,
    const double *left, const double *right, const tsk_id_t *parent,
    const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset);

/**
@brief Extends this table by copying from a set of column arrays.

@rst
Copies column data from the specified set of arrays to create new rows at the end of the
table. The supplied arrays should all contain data on the same number of rows. The
metadata schema is not affected.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param num_rows The number of rows to copy from the specified arrays.
@param left The array of double left values to be copied.
@param right The array of double right values to be copied.
@param parent The array of tsk_id_t parent values to be copied.
@param child The array of tsk_id_t child values to be copied.
@param metadata The array of char metadata values to be copied.
@param metadata_offset The array of tsk_size_t metadata offset values to be copied.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_append_columns(tsk_edge_table_t *self, tsk_size_t num_rows,
    const double *left, const double *right, const tsk_id_t *parent,
    const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset);

/**
@brief Controls the pre-allocation strategy for this table.

@rst
Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default
pre-allocation strategy.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param max_rows_increment The number of rows to pre-allocate, or zero for the default
    doubling strategy.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_set_max_rows_increment(
    tsk_edge_table_t *self, tsk_size_t max_rows_increment);

/**
@brief Controls the pre-allocation strategy for the metadata column.

@rst
Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default
pre-allocation strategy.
@endrst

@param self A pointer to a tsk_edge_table_t object.
@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for
    the default doubling strategy.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_set_max_metadata_length_increment(
    tsk_edge_table_t *self, tsk_size_t max_metadata_length_increment);

/**
@brief Squash adjacent edges in-place

@rst
Sorts, then condenses the table into the smallest possible number of rows by
combining any adjacent edges. A pair of edges is said to be `adjacent` if
they have the same parent and child nodes, and if the left coordinate of
one of the edges is equal to the right coordinate of the other edge.
This process is performed in-place so that any set of adjacent edges is
replaced by a single edge.
The new edge will have the same parent and child node, a left coordinate
equal to the smallest left coordinate in the set, and a right coordinate
equal to the largest right coordinate in the set.
The new edge table will be sorted in the canonical order (P, C, L, R).

.. note::
    Note that this method will fail if any edges have non-empty metadata.

@endrst

@param self A pointer to a tsk_edge_table_t object.
@return Return 0 on success or a negative value on failure.
*/
int tsk_edge_table_squash(tsk_edge_table_t *self);

/** @} */

/* Undocumented methods */

int tsk_edge_table_dump_text(const tsk_edge_table_t *self, FILE *out);

/**
@defgroup MIGRATION_TABLE_API_GROUP Migration table API.
@{
*/

/**
@brief Initialises the table by allocating the internal memory.

@rst
This must be called before any operations are performed on the table.
See the :ref:`sec_c_api_overview_structure` for details on how objects
are initialised and freed.
@endrst

@param self A pointer to an uninitialised tsk_migration_table_t object.
@param options Allocation time options. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_init(tsk_migration_table_t *self, tsk_flags_t options);

/**
@brief Free the internal memory for the specified table.

@param self A pointer to an initialised tsk_migration_table_t object.
@return Always returns 0.
*/
int tsk_migration_table_free(tsk_migration_table_t *self);

/**
@brief Adds a row to this migration table.

@rst
Add a new migration with the specified ``left``, ``right``, ``node``,
``source``, ``dest``, ``time`` and ``metadata`` to the table.
See the :ref:`table definition <sec_migration_table_definition>`
for details of the columns in this table.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param left The left coordinate for the new migration.
@param right The right coordinate for the new migration.
@param node The node ID for the new migration.
@param source The source population ID for the new migration.
@param dest The destination population ID for the new migration.
@param time The time for the new migration.
@param metadata The metadata to be associated with the new migration. This
    is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0.
@param metadata_length The size of the metadata array in bytes.
@return Return the ID of the newly added migration on success,
    or a negative value on failure.
*/
tsk_id_t tsk_migration_table_add_row(tsk_migration_table_t *self, double left,
    double right, tsk_id_t node, tsk_id_t source, tsk_id_t dest, double time,
    const char *metadata, tsk_size_t metadata_length);

/**
@brief Updates the row at the specified index.

@rst
Rewrite the row at the specified index in this table to use the specified
values. A copy of the ``metadata`` parameter is taken immediately. See the
:ref:`table definition <sec_migration_table_definition>` for details of the
columns in this table.

.. warning::
    Because of the way that ragged columns are encoded, this method requires a
    full rewrite of the internal column memory in the worst case, and would
    therefore be inefficient for bulk updates for such columns. However, if the
    sizes of all ragged column values are unchanged in the updated row, this
    method is guaranteed to only update the memory for the row in question.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param index The row to update.
@param left The left coordinate for the migration.
@param right The right coordinate for the migration.
@param node The node ID for the migration.
@param source The source population ID for the migration.
@param dest The destination population ID for the migration.
@param time The time for the migration.
@param metadata The metadata to be associated with the migration. This
    is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0.
@param metadata_length The size of the metadata array in bytes.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_update_row(tsk_migration_table_t *self, tsk_id_t index,
    double left, double right, tsk_id_t node, tsk_id_t source, tsk_id_t dest,
    double time, const char *metadata, tsk_size_t metadata_length);

/**
@brief Clears this table, setting the number of rows to zero.

@rst
No memory is freed as a result of this operation; please use
:c:func:`tsk_migration_table_free` to free the table's internal resources. Note that
the metadata schema is not cleared.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_clear(tsk_migration_table_t *self);

/**
@brief Truncates this table so that only the first num_rows are retained.

@param self A pointer to a tsk_migration_table_t object.
@param num_rows The number of rows to retain in the table.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_truncate(tsk_migration_table_t *self, tsk_size_t num_rows);

/**
@brief Extends this table by appending rows copied from another table.

@rst
Appends the rows at the specified indexes from the table ``other`` to the end of this
table. Row indexes can be repeated and in any order. If ``row_indexes`` is ``NULL``,
append the first ``num_rows`` from ``other`` to this table. Note that metadata is copied
as-is and is not checked for compatibility with any existing schema on this table.
@endrst

@param self A pointer to a tsk_migration_table_t object where rows are to be added.
@param other A pointer to a tsk_migration_table_t object where rows are copied from.
@param num_rows The number of rows from ``other`` to append to this table.
@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the
    first ``num_rows`` of ``other`` are used.
@param options Bitwise option flags. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_extend(tsk_migration_table_t *self,
    const tsk_migration_table_t *other, tsk_size_t num_rows,
    const tsk_id_t *row_indexes, tsk_flags_t options);

/**
@brief Subset this table by keeping rows according to a boolean mask.

@rst
Deletes rows from this table and optionally returns the mapping from IDs in
the current table to the updated table. Rows are kept or deleted according to
the specified boolean array ``keep`` such that for each row ``j``
if ``keep[j]`` is false (zero) the row is deleted, and otherwise the row is
retained. Thus, ``keep`` must be an array of at least ``num_rows``
:c:type:`bool` values.

If the ``id_map`` argument is non-null, this array will be updated to represent
the mapping between IDs before and after row deletion. For row ``j``,
``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or
:c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an array
of at least ``num_rows`` :c:type:`tsk_id_t` values.

.. warning::
    C++ users need to be careful to specify the correct type when
    passing in values for the ``keep`` array,
    using ``std::vector<tsk_bool_t>`` and not ``std::vector<bool>``,
    as the latter may not be the correct size.

@endrst

@param self A pointer to a tsk_migration_table_t object.
@param keep Array of boolean flags describing whether a particular
    row should be kept or not. Must be at least ``num_rows`` long.
@param options Bitwise option flags. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@param id_map An array in which to store the mapping between new
    and old IDs. If NULL, this will be ignored.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_keep_rows(tsk_migration_table_t *self, const tsk_bool_t *keep,
    tsk_flags_t options, tsk_id_t *id_map);

/**
@brief Returns true if the data in the specified table is identical to the data
       in this table.

@rst

**Options**

Options to control the comparison can be specified by providing one or
more of the following bitwise flags. By default (options=0) tables are
considered equal if they are byte-wise identical in all columns,
and their metadata schemas are byte-wise identical.

- :c:macro:`TSK_CMP_IGNORE_METADATA`
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param other A pointer to a tsk_migration_table_t object.
@param options Bitwise comparison options.
@return Return true if the specified table is equal to this table.
*/
bool tsk_migration_table_equals(const tsk_migration_table_t *self,
    const tsk_migration_table_t *other, tsk_flags_t options);

/**
@brief Copies the state of this table into the specified destination.

@rst
By default the method initialises the specified destination table. If the
destination is already initialised, the :c:macro:`TSK_NO_INIT` option should
be supplied to avoid leaking memory.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param dest A pointer to a tsk_migration_table_t object. If the TSK_NO_INIT
    option is specified, this must be an initialised migration table. If not, it must
    be an uninitialised migration table.
@param options Bitwise option flags.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_copy(
    const tsk_migration_table_t *self, tsk_migration_table_t *dest, tsk_flags_t options);

/**
@brief Get the row at the specified index.

@rst
Updates the specified migration struct to reflect the values in the specified row.
Pointers to memory within this struct are handled by the table and should **not**
be freed by client code. These pointers are guaranteed to be valid until the
next operation that modifies the table (e.g., by adding a new row), but not afterwards.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param index The requested table row.
@param row A pointer to a tsk_migration_t struct that is updated to reflect the
    values in the specified row.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_get_row(
    const tsk_migration_table_t *self, tsk_id_t index, tsk_migration_t *row);

/**
@brief Set the metadata schema.

@rst
Copies the metadata schema string to this table, replacing any existing schema.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param metadata_schema A pointer to a char array.
@param metadata_schema_length The size of the metadata schema in bytes.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_set_metadata_schema(tsk_migration_table_t *self,
    const char *metadata_schema, tsk_size_t metadata_schema_length);

/**
@brief Print out the state of this table to the specified stream.

This method is intended for debugging purposes and should not be used
in production code. The format of the output should **not** be depended
on and may change arbitrarily between versions.

@param self A pointer to a tsk_migration_table_t object.
@param out The stream to write the summary to.
*/
void tsk_migration_table_print_state(const tsk_migration_table_t *self, FILE *out);

/**
@brief Replace this table's data by copying from a set of column arrays.

@rst
Clears the data columns of this table and then copies column data from the specified
set of arrays. The supplied arrays should all contain data on the same number of rows.
The metadata schema is not affected.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param num_rows The number of rows to copy from the specified arrays.
@param left The array of double left values to be copied.
@param right The array of double right values to be copied.
@param node The array of tsk_id_t node values to be copied.
@param source The array of tsk_id_t source values to be copied.
@param dest The array of tsk_id_t dest values to be copied.
@param time The array of double time values to be copied.
@param metadata The array of char metadata values to be copied.
@param metadata_offset The array of tsk_size_t metadata offset values to be copied.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_set_columns(tsk_migration_table_t *self, tsk_size_t num_rows,
    const double *left, const double *right, const tsk_id_t *node,
    const tsk_id_t *source, const tsk_id_t *dest, const double *time,
    const char *metadata, const tsk_size_t *metadata_offset);

/**
@brief Extends this table by copying from a set of column arrays.

@rst
Copies column data from the specified set of arrays to create new rows at the end of the
table. The supplied arrays should all contain data on the same number of rows. The
metadata schema is not affected.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param num_rows The number of rows to copy from the specified arrays.
@param left The array of double left values to be copied.
@param right The array of double right values to be copied.
@param node The array of tsk_id_t node values to be copied.
@param source The array of tsk_id_t source values to be copied.
@param dest The array of tsk_id_t dest values to be copied.
@param time The array of double time values to be copied.
@param metadata The array of char metadata values to be copied.
@param metadata_offset The array of tsk_size_t metadata offset values to be copied.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_append_columns(tsk_migration_table_t *self,
    tsk_size_t num_rows, const double *left, const double *right,
    const tsk_id_t *node, const tsk_id_t *source, const tsk_id_t *dest,
    const double *time, const char *metadata, const tsk_size_t *metadata_offset);

/**
@brief Controls the pre-allocation strategy for this table.

@rst
Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default
pre-allocation strategy.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param max_rows_increment The number of rows to pre-allocate, or zero for the default
    doubling strategy.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_set_max_rows_increment(
    tsk_migration_table_t *self, tsk_size_t max_rows_increment);

/**
@brief Controls the pre-allocation strategy for the metadata column.

@rst
Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default
pre-allocation strategy.
@endrst

@param self A pointer to a tsk_migration_table_t object.
@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for
    the default doubling strategy.
@return Return 0 on success or a negative value on failure.
*/
int tsk_migration_table_set_max_metadata_length_increment(
    tsk_migration_table_t *self, tsk_size_t max_metadata_length_increment);

/** @} */

/* Undocumented methods */

int tsk_migration_table_dump_text(const tsk_migration_table_t *self, FILE *out);

/**
@defgroup SITE_TABLE_API_GROUP Site table API.
@{
*/

/**
@brief Initialises the table by allocating the internal memory.

@rst
This must be called before any operations are performed on the table.
See the :ref:`sec_c_api_overview_structure` for details on how objects
are initialised and freed.
@endrst

@param self A pointer to an uninitialised tsk_site_table_t object.
@param options Allocation time options. Currently unused; should be
    set to zero to ensure compatibility with later versions of tskit.
@return Return 0 on success or a negative value on failure.
*/
int tsk_site_table_init(tsk_site_table_t *self, tsk_flags_t options);

/**
@brief Free the internal memory for the specified table.

@param self A pointer to an initialised tsk_site_table_t object.
@return Always returns 0.
*/ int tsk_site_table_free(tsk_site_table_t *self); /** @brief Adds a row to this site table. @rst Add a new site with the specified ``position``, ``ancestral_state`` and ``metadata`` to the table. Copies of ``ancestral_state`` and ``metadata`` are immediately taken. See the :ref:`table definition ` for details of the columns in this table. @endrst @param self A pointer to a tsk_site_table_t object. @param position The position coordinate for the new site. @param ancestral_state The ancestral_state for the new site. @param ancestral_state_length The length of the ancestral_state in bytes. @param metadata The metadata to be associated with the new site. This is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. @param metadata_length The size of the metadata array in bytes. @return Return the ID of the newly added site on success, or a negative value on failure. */ tsk_id_t tsk_site_table_add_row(tsk_site_table_t *self, double position, const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, tsk_size_t metadata_length); /** @brief Updates the row at the specified index. @rst Rewrite the row at the specified index in this table to use the specified values. Copies of the ``ancestral_state`` and ``metadata`` parameters are taken immediately. See the :ref:`table definition ` for details of the columns in this table. .. warning:: Because of the way that ragged columns are encoded, this method requires a full rewrite of the internal column memory in worst case, and would therefore be inefficient for bulk updates for such columns. However, if the sizes of all ragged column values are unchanged in the updated row, this method is guaranteed to only update the memory for the row in question. @endrst @param self A pointer to a tsk_site_table_t object. @param index The row to update. @param position The position coordinate for the site. @param ancestral_state The ancestral_state for the site. 
@param ancestral_state_length The length of the ancestral_state in bytes. @param metadata The metadata to be associated with the site. This is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. @param metadata_length The size of the metadata array in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_update_row(tsk_site_table_t *self, tsk_id_t index, double position, const char *ancestral_state, tsk_size_t ancestral_state_length, const char *metadata, tsk_size_t metadata_length); /** @brief Clears this table, setting the number of rows to zero. @rst No memory is freed as a result of this operation; please use :c:func:`tsk_site_table_free` to free the table's internal resources. Note that the metadata schema is not cleared. @endrst @param self A pointer to a tsk_site_table_t object. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_clear(tsk_site_table_t *self); /** @brief Truncates this table so that only the first num_rows are retained. @param self A pointer to a tsk_site_table_t object. @param num_rows The number of rows to retain in the table. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_truncate(tsk_site_table_t *self, tsk_size_t num_rows); /** @brief Extends this table by appending rows copied from another table. @rst Appends the rows at the specified indexes from the table ``other`` to the end of this table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is and is not checked for compatibility with any existing schema on this table. @endrst @param self A pointer to a tsk_site_table_t object where rows are to be added. @param other A pointer to a tsk_site_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. 
@param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_extend(tsk_site_table_t *self, const tsk_site_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); /** @brief Subset this table by keeping rows according to a boolean mask. @rst Deletes rows from this table and optionally returns the mapping from IDs in the current table to the updated table. Rows are kept or deleted according to the specified boolean array ``keep`` such that for each row ``j`` if ``keep[j]`` is false (zero) the row is deleted, and otherwise the row is retained. Thus, ``keep`` must be an array of at least ``num_rows`` :c:type:`bool` values. If the ``id_map`` argument is non-null, this array will be updated to represent the mapping between IDs before and after row deletion. For row ``j``, ``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or :c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an array of at least ``num_rows`` :c:type:`tsk_id_t` values. .. warning:: C++ users need to be careful to specify the correct type when passing in values for the ``keep`` array, using ``std::vector<tsk_bool_t>`` and not ``std::vector<bool>``, as the latter may not be the correct size. @endrst @param self A pointer to a tsk_site_table_t object. @param keep Array of boolean flags describing whether a particular row should be kept or not. Must be at least ``num_rows`` long. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @param id_map An array in which to store the mapping between new and old IDs. If NULL, this will be ignored. @return Return 0 on success or a negative value on failure.
*/ int tsk_site_table_keep_rows(tsk_site_table_t *self, const tsk_bool_t *keep, tsk_flags_t options, tsk_id_t *id_map); /** @brief Returns true if the data in the specified table is identical to the data in this table. @rst **Options** Options to control the comparison can be specified by providing one or more of the following bitwise flags. By default (options=0) tables are considered equal if they are byte-wise identical in all columns, and their metadata schemas are byte-wise identical. - :c:macro:`TSK_CMP_IGNORE_METADATA` @endrst @param self A pointer to a tsk_site_table_t object. @param other A pointer to a tsk_site_table_t object. @param options Bitwise comparison options. @return Return true if the specified table is equal to this table. */ bool tsk_site_table_equals( const tsk_site_table_t *self, const tsk_site_table_t *other, tsk_flags_t options); /** @brief Copies the state of this table into the specified destination. @rst By default the method initialises the specified destination table. If the destination is already initialised, the :c:macro:`TSK_NO_INIT` option should be supplied to avoid leaking memory. @endrst @param self A pointer to a tsk_site_table_t object. @param dest A pointer to a tsk_site_table_t object. If the TSK_NO_INIT option is specified, this must be an initialised site table. If not, it must be an uninitialised site table. @param options Bitwise option flags. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_copy( const tsk_site_table_t *self, tsk_site_table_t *dest, tsk_flags_t options); /** @brief Get the row at the specified index. @rst Updates the specified site struct to reflect the values in the specified row. This function always sets the ``mutations`` and ``mutations_length`` fields in the parameter :c:struct:`tsk_site_t` to ``NULL`` and ``0`` respectively. To get access to the mutations for a particular site, please use the tree sequence method, :c:func:`tsk_treeseq_get_site`. 
Pointers to memory within this struct are handled by the table and should **not** be freed by client code. These pointers are guaranteed to be valid until the next operation that modifies the table (e.g., by adding a new row), but not afterwards. @endrst @param self A pointer to a tsk_site_table_t object. @param index The requested table row. @param row A pointer to a tsk_site_t struct that is updated to reflect the values in the specified row. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_get_row( const tsk_site_table_t *self, tsk_id_t index, tsk_site_t *row); /** @brief Set the metadata schema @rst Copies the metadata schema string to this table, replacing any existing. @endrst @param self A pointer to a tsk_site_table_t object. @param metadata_schema A pointer to a char array. @param metadata_schema_length The size of the metadata schema in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_set_metadata_schema(tsk_site_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length); /** @brief Print out the state of this table to the specified stream. This method is intended for debugging purposes and should not be used in production code. The format of the output should **not** be depended on and may change arbitrarily between versions. @param self A pointer to a tsk_site_table_t object. @param out The stream to write the summary to. */ void tsk_site_table_print_state(const tsk_site_table_t *self, FILE *out); /** @brief Replace this table's data by copying from a set of column arrays @rst Clears the data columns of this table and then copies column data from the specified set of arrays. The supplied arrays should all contain data on the same number of rows. The metadata schema is not affected. @endrst @param self A pointer to a tsk_site_table_t object. @param num_rows The number of rows to copy from the specified arrays.
@param position The array of double position values to be copied. @param ancestral_state The array of char ancestral state values to be copied. @param ancestral_state_offset The array of tsk_size_t ancestral state offset values to be copied. @param metadata The array of char metadata values to be copied. @param metadata_offset The array of tsk_size_t metadata offset values to be copied. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_set_columns(tsk_site_table_t *self, tsk_size_t num_rows, const double *position, const char *ancestral_state, const tsk_size_t *ancestral_state_offset, const char *metadata, const tsk_size_t *metadata_offset); /** @brief Extends this table by copying from a set of column arrays @rst Copies column data from the specified set of arrays to create new rows at the end of the table. The supplied arrays should all contain data on the same number of rows. The metadata schema is not affected. @endrst @param self A pointer to a tsk_site_table_t object. @param num_rows The number of rows to copy from the specified arrays. @param position The array of double position values to be copied. @param ancestral_state The array of char ancestral state values to be copied. @param ancestral_state_offset The array of tsk_size_t ancestral state offset values to be copied. @param metadata The array of char metadata values to be copied. @param metadata_offset The array of tsk_size_t metadata offset values to be copied. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_append_columns(tsk_site_table_t *self, tsk_size_t num_rows, const double *position, const char *ancestral_state, const tsk_size_t *ancestral_state_offset, const char *metadata, const tsk_size_t *metadata_offset); /** @brief Controls the pre-allocation strategy for this table @rst Set a fixed pre-allocation size, or use the default doubling strategy.
See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_site_table_t object. @param max_rows_increment The number of rows to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_set_max_rows_increment( tsk_site_table_t *self, tsk_size_t max_rows_increment); /** @brief Controls the pre-allocation strategy for the metadata column @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_site_table_t object. @param max_metadata_length_increment The number of bytes to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_set_max_metadata_length_increment( tsk_site_table_t *self, tsk_size_t max_metadata_length_increment); /** @brief Controls the pre-allocation strategy for the ancestral_state column @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_site_table_t object. @param max_ancestral_state_length_increment The number of bytes to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_site_table_set_max_ancestral_state_length_increment( tsk_site_table_t *self, tsk_size_t max_ancestral_state_length_increment); /** @} */ /* Undocumented methods */ int tsk_site_table_dump_text(const tsk_site_table_t *self, FILE *out); /** @defgroup MUTATION_TABLE_API_GROUP Mutation table API. @{ */ /** @brief Initialises the table by allocating the internal memory. @rst This must be called before any operations are performed on the table.
See the :ref:`sec_c_api_overview_structure` for details on how objects are initialised and freed. @endrst @param self A pointer to an uninitialised tsk_mutation_table_t object. @param options Allocation time options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_init(tsk_mutation_table_t *self, tsk_flags_t options); /** @brief Free the internal memory for the specified table. @param self A pointer to an initialised tsk_mutation_table_t object. @return Always returns 0. */ int tsk_mutation_table_free(tsk_mutation_table_t *self); /** @brief Adds a row to this mutation table. @rst Add a new mutation with the specified ``site``, ``node``, ``parent``, ``time``, ``derived_state`` and ``metadata`` to the table. Copies of ``derived_state`` and ``metadata`` are immediately taken. See the :ref:`table definition <sec_mutation_table_definition>` for details of the columns in this table. @endrst @param self A pointer to a tsk_mutation_table_t object. @param site The site ID for the new mutation. @param node The ID of the node this mutation occurs over. @param parent The ID of the parent mutation. @param time The time of the mutation. @param derived_state The derived_state for the new mutation. @param derived_state_length The length of the derived_state in bytes. @param metadata The metadata to be associated with the new mutation. This is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. @param metadata_length The size of the metadata array in bytes. @return Return the ID of the newly added mutation on success, or a negative value on failure. */ tsk_id_t tsk_mutation_table_add_row(tsk_mutation_table_t *self, tsk_id_t site, tsk_id_t node, tsk_id_t parent, double time, const char *derived_state, tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length); /** @brief Updates the row at the specified index.
@rst Rewrite the row at the specified index in this table to use the specified values. Copies of the ``derived_state`` and ``metadata`` parameters are taken immediately. See the :ref:`table definition <sec_mutation_table_definition>` for details of the columns in this table. .. warning:: Because of the way that ragged columns are encoded, this method requires a full rewrite of the internal column memory in the worst case, and would therefore be inefficient for bulk updates for such columns. However, if the sizes of all ragged column values are unchanged in the updated row, this method is guaranteed to only update the memory for the row in question. @endrst @param self A pointer to a tsk_mutation_table_t object. @param index The row to update. @param site The site ID for the mutation. @param node The ID of the node this mutation occurs over. @param parent The ID of the parent mutation. @param time The time of the mutation. @param derived_state The derived_state for the mutation. @param derived_state_length The length of the derived_state in bytes. @param metadata The metadata to be associated with the mutation. This is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. @param metadata_length The size of the metadata array in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_update_row(tsk_mutation_table_t *self, tsk_id_t index, tsk_id_t site, tsk_id_t node, tsk_id_t parent, double time, const char *derived_state, tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length); /** @brief Clears this table, setting the number of rows to zero. @rst No memory is freed as a result of this operation; please use :c:func:`tsk_mutation_table_free` to free the table's internal resources. Note that the metadata schema is not cleared. @endrst @param self A pointer to a tsk_mutation_table_t object. @return Return 0 on success or a negative value on failure.
*/ int tsk_mutation_table_clear(tsk_mutation_table_t *self); /** @brief Truncates this table so that only the first num_rows are retained. @param self A pointer to a tsk_mutation_table_t object. @param num_rows The number of rows to retain in the table. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_truncate(tsk_mutation_table_t *self, tsk_size_t num_rows); /** @brief Extends this table by appending rows copied from another table. @rst Appends the rows at the specified indexes from the table ``other`` to the end of this table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is and is not checked for compatibility with any existing schema on this table. @endrst @param self A pointer to a tsk_mutation_table_t object where rows are to be added. @param other A pointer to a tsk_mutation_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_extend(tsk_mutation_table_t *self, const tsk_mutation_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); /** @brief Subset this table by keeping rows according to a boolean mask. @rst Deletes rows from this table and optionally returns the mapping from IDs in the current table to the updated table. Rows are kept or deleted according to the specified boolean array ``keep`` such that for each row ``j`` if ``keep[j]`` is false (zero) the row is deleted, and otherwise the row is retained.
Thus, ``keep`` must be an array of at least ``num_rows`` :c:type:`bool` values. If the ``id_map`` argument is non-null, this array will be updated to represent the mapping between IDs before and after row deletion. For row ``j``, ``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or :c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an array of at least ``num_rows`` :c:type:`tsk_id_t` values. The values in the ``parent`` column are updated according to this map, so that reference integrity within the table is maintained. As a consequence of this, the values in the ``parent`` column for kept rows are bounds-checked and an error raised if they are not valid. Rows that are deleted are not checked for parent ID integrity. If an attempt is made to delete rows that are referred to by the ``parent`` column of rows that are retained, an error is raised. These error conditions are checked before any alterations to the table are made. .. warning:: C++ users need to be careful to specify the correct type when passing in values for the ``keep`` array, using ``std::vector<tsk_bool_t>`` and not ``std::vector<bool>``, as the latter may not be the correct size. @endrst @param self A pointer to a tsk_mutation_table_t object. @param keep Array of boolean flags describing whether a particular row should be kept or not. Must be at least ``num_rows`` long. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @param id_map An array in which to store the mapping between new and old IDs. If NULL, this will be ignored. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_keep_rows(tsk_mutation_table_t *self, const tsk_bool_t *keep, tsk_flags_t options, tsk_id_t *id_map); /** @brief Returns true if the data in the specified table is identical to the data in this table.
@rst **Options** Options to control the comparison can be specified by providing one or more of the following bitwise flags. By default (options=0) tables are considered equal if they are byte-wise identical in all columns, and their metadata schemas are byte-wise identical. - :c:macro:`TSK_CMP_IGNORE_METADATA` @endrst @param self A pointer to a tsk_mutation_table_t object. @param other A pointer to a tsk_mutation_table_t object. @param options Bitwise comparison options. @return Return true if the specified table is equal to this table. */ bool tsk_mutation_table_equals(const tsk_mutation_table_t *self, const tsk_mutation_table_t *other, tsk_flags_t options); /** @brief Copies the state of this table into the specified destination. @rst By default the method initialises the specified destination table. If the destination is already initialised, the :c:macro:`TSK_NO_INIT` option should be supplied to avoid leaking memory. @endrst @param self A pointer to a tsk_mutation_table_t object. @param dest A pointer to a tsk_mutation_table_t object. If the TSK_NO_INIT option is specified, this must be an initialised mutation table. If not, it must be an uninitialised mutation table. @param options Bitwise option flags. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_copy( const tsk_mutation_table_t *self, tsk_mutation_table_t *dest, tsk_flags_t options); /** @brief Get the row at the specified index. @rst Updates the specified mutation struct to reflect the values in the specified row. This function always sets the ``edge`` field in parameter :c:struct:`tsk_mutation_t` to ``TSK_NULL``. To determine the ID of the edge associated with a particular mutation, please use the tree sequence method, :c:func:`tsk_treeseq_get_mutation`. Pointers to memory within this struct are handled by the table and should **not** be freed by client code. 
These pointers are guaranteed to be valid until the next operation that modifies the table (e.g., by adding a new row), but not afterwards. @endrst @param self A pointer to a tsk_mutation_table_t object. @param index The requested table row. @param row A pointer to a tsk_mutation_t struct that is updated to reflect the values in the specified row. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_get_row( const tsk_mutation_table_t *self, tsk_id_t index, tsk_mutation_t *row); /** @brief Set the metadata schema @rst Copies the metadata schema string to this table, replacing any existing. @endrst @param self A pointer to a tsk_mutation_table_t object. @param metadata_schema A pointer to a char array. @param metadata_schema_length The size of the metadata schema in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_set_metadata_schema(tsk_mutation_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length); /** @brief Print out the state of this table to the specified stream. This method is intended for debugging purposes and should not be used in production code. The format of the output should **not** be depended on and may change arbitrarily between versions. @param self A pointer to a tsk_mutation_table_t object. @param out The stream to write the summary to. */ void tsk_mutation_table_print_state(const tsk_mutation_table_t *self, FILE *out); /** @brief Replace this table's data by copying from a set of column arrays @rst Clears the data columns of this table and then copies column data from the specified set of arrays. The supplied arrays should all contain data on the same number of rows. The metadata schema is not affected. @endrst @param self A pointer to a tsk_mutation_table_t object. @param num_rows The number of rows to copy from the specified arrays. @param site The array of tsk_id_t site values to be copied.
@param node The array of tsk_id_t node values to be copied. @param parent The array of tsk_id_t parent values to be copied. @param time The array of double time values to be copied. @param derived_state The array of char derived_state values to be copied. @param derived_state_offset The array of tsk_size_t derived state offset values to be copied. @param metadata The array of char metadata values to be copied. @param metadata_offset The array of tsk_size_t metadata offset values to be copied. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_set_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, const double *time, const char *derived_state, const tsk_size_t *derived_state_offset, const char *metadata, const tsk_size_t *metadata_offset); /** @brief Extends this table by copying from a set of column arrays @rst Copies column data from the specified set of arrays to create new rows at the end of the table. The supplied arrays should all contain data on the same number of rows. The metadata schema is not affected. @endrst @param self A pointer to a tsk_mutation_table_t object. @param num_rows The number of rows to copy from the specified arrays. @param site The array of tsk_id_t site values to be copied. @param node The array of tsk_id_t node values to be copied. @param parent The array of tsk_id_t parent values to be copied. @param time The array of double time values to be copied. @param derived_state The array of char derived_state values to be copied. @param derived_state_offset The array of tsk_size_t derived state offset values to be copied. @param metadata The array of char metadata values to be copied. @param metadata_offset The array of tsk_size_t metadata offset values to be copied. @return Return 0 on success or a negative value on failure.
*/ int tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, const double *time, const char *derived_state, const tsk_size_t *derived_state_offset, const char *metadata, const tsk_size_t *metadata_offset); /** @brief Controls the pre-allocation strategy for this table @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_mutation_table_t object. @param max_rows_increment The number of rows to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_set_max_rows_increment( tsk_mutation_table_t *self, tsk_size_t max_rows_increment); /** @brief Controls the pre-allocation strategy for the metadata column @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_mutation_table_t object. @param max_metadata_length_increment The number of bytes to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_set_max_metadata_length_increment( tsk_mutation_table_t *self, tsk_size_t max_metadata_length_increment); /** @brief Controls the pre-allocation strategy for the derived_state column @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_mutation_table_t object. @param max_derived_state_length_increment The number of bytes to pre-allocate, or zero for the default doubling strategy.
@return Return 0 on success or a negative value on failure. */ int tsk_mutation_table_set_max_derived_state_length_increment( tsk_mutation_table_t *self, tsk_size_t max_derived_state_length_increment); /** @} */ /* Undocumented methods */ int tsk_mutation_table_dump_text(const tsk_mutation_table_t *self, FILE *out); /** @defgroup POPULATION_TABLE_API_GROUP Population table API. @{ */ /** @brief Initialises the table by allocating the internal memory. @rst This must be called before any operations are performed on the table. See the :ref:`sec_c_api_overview_structure` for details on how objects are initialised and freed. @endrst @param self A pointer to an uninitialised tsk_population_table_t object. @param options Allocation time options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_init(tsk_population_table_t *self, tsk_flags_t options); /** @brief Free the internal memory for the specified table. @param self A pointer to an initialised tsk_population_table_t object. @return Always returns 0. */ int tsk_population_table_free(tsk_population_table_t *self); /** @brief Adds a row to this population table. @rst Add a new population with the specified ``metadata`` to the table. A copy of the ``metadata`` is immediately taken. See the :ref:`table definition <sec_population_table_definition>` for details of the columns in this table. @endrst @param self A pointer to a tsk_population_table_t object. @param metadata The metadata to be associated with the new population. This is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. @param metadata_length The size of the metadata array in bytes. @return Return the ID of the newly added population on success, or a negative value on failure.
*/ tsk_id_t tsk_population_table_add_row( tsk_population_table_t *self, const char *metadata, tsk_size_t metadata_length); /** @brief Updates the row at the specified index. @rst Rewrite the row at the specified index in this table to use the specified values. A copy of the ``metadata`` parameter is taken immediately. See the :ref:`table definition <sec_population_table_definition>` for details of the columns in this table. .. warning:: Because of the way that ragged columns are encoded, this method requires a full rewrite of the internal column memory in the worst case, and would therefore be inefficient for bulk updates for such columns. However, if the sizes of all ragged column values are unchanged in the updated row, this method is guaranteed to only update the memory for the row in question. @endrst @param self A pointer to a tsk_population_table_t object. @param index The row to update. @param metadata The metadata to be associated with the population. This is a pointer to arbitrary memory. Can be ``NULL`` if ``metadata_length`` is 0. @param metadata_length The size of the metadata array in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_update_row(tsk_population_table_t *self, tsk_id_t index, const char *metadata, tsk_size_t metadata_length); /** @brief Clears this table, setting the number of rows to zero. @rst No memory is freed as a result of this operation; please use :c:func:`tsk_population_table_free` to free the table's internal resources. Note that the metadata schema is not cleared. @endrst @param self A pointer to a tsk_population_table_t object. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_clear(tsk_population_table_t *self); /** @brief Truncates this table so that only the first num_rows are retained. @param self A pointer to a tsk_population_table_t object. @param num_rows The number of rows to retain in the table. @return Return 0 on success or a negative value on failure.
*/ int tsk_population_table_truncate(tsk_population_table_t *self, tsk_size_t num_rows); /** @brief Extends this table by appending rows copied from another table. @rst Appends the rows at the specified indexes from the table ``other`` to the end of this table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append the first ``num_rows`` from ``other`` to this table. Note that metadata is copied as-is and is not checked for compatibility with any existing schema on this table. @endrst @param self A pointer to a tsk_population_table_t object where rows are to be added. @param other A pointer to a tsk_population_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_extend(tsk_population_table_t *self, const tsk_population_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); /** @brief Subset this table by keeping rows according to a boolean mask. @rst Deletes rows from this table and optionally returns the mapping from IDs in the current table to the updated table. Rows are kept or deleted according to the specified boolean array ``keep`` such that for each row ``j`` if ``keep[j]`` is false (zero) the row is deleted, and otherwise the row is retained. Thus, ``keep`` must be an array of at least ``num_rows`` :c:type:`bool` values. If the ``id_map`` argument is non-null, this array will be updated to represent the mapping between IDs before and after row deletion. For row ``j``, ``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or :c:macro:`TSK_NULL` if the row has been removed.
Thus, ``id_map`` must be an array of at least ``num_rows`` :c:type:`tsk_id_t` values. .. warning:: C++ users need to be careful to specify the correct type when passing in values for the ``keep`` array, using ``std::vector<tsk_bool_t>`` and not ``std::vector<bool>``, as the latter may not be the correct size. @endrst @param self A pointer to a tsk_population_table_t object. @param keep Array of boolean flags describing whether a particular row should be kept or not. Must be at least ``num_rows`` long. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @param id_map An array in which to store the mapping between new and old IDs. If NULL, this will be ignored. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_keep_rows(tsk_population_table_t *self, const tsk_bool_t *keep, tsk_flags_t options, tsk_id_t *id_map); /** @brief Returns true if the data in the specified table is identical to the data in this table. @rst **Options** Options to control the comparison can be specified by providing one or more of the following bitwise flags. By default (options=0) tables are considered equal if they are byte-wise identical in all columns, and their metadata schemas are byte-wise identical. - :c:macro:`TSK_CMP_IGNORE_METADATA` Do not include metadata in the comparison. Note that as metadata is the only column in the population table, two population tables are considered equal if they have the same number of rows if this flag is specified. @endrst @param self A pointer to a tsk_population_table_t object. @param other A pointer to a tsk_population_table_t object. @param options Bitwise comparison options. @return Return true if the specified table is equal to this table. */ bool tsk_population_table_equals(const tsk_population_table_t *self, const tsk_population_table_t *other, tsk_flags_t options); /** @brief Copies the state of this table into the specified destination.
@rst By default the method initialises the specified destination table. If the destination is already initialised, the :c:macro:`TSK_NO_INIT` option should be supplied to avoid leaking memory. @endrst @param self A pointer to a tsk_population_table_t object. @param dest A pointer to a tsk_population_table_t object. If the TSK_NO_INIT option is specified, this must be an initialised population table. If not, it must be an uninitialised population table. @param options Bitwise option flags. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_copy(const tsk_population_table_t *self, tsk_population_table_t *dest, tsk_flags_t options); /** @brief Get the row at the specified index. @rst Updates the specified population struct to reflect the values in the specified row. Pointers to memory within this struct are handled by the table and should **not** be freed by client code. These pointers are guaranteed to be valid until the next operation that modifies the table (e.g., by adding a new row), but not afterwards. @endrst @param self A pointer to a tsk_population_table_t object. @param index The requested table row. @param row A pointer to a tsk_population_t struct that is updated to reflect the values in the specified row. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_get_row( const tsk_population_table_t *self, tsk_id_t index, tsk_population_t *row); /** @brief Set the metadata schema @rst Copies the metadata schema string to this table, replacing any existing. @endrst @param self A pointer to a tsk_population_table_t object. @param metadata_schema A pointer to a char array. @param metadata_schema_length The size of the metadata schema in bytes. @return Return 0 on success or a negative value on failure. 
*/ int tsk_population_table_set_metadata_schema(tsk_population_table_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length); /** @brief Print out the state of this table to the specified stream. This method is intended for debugging purposes and should not be used in production code. The format of the output should **not** be depended on and may change arbitrarily between versions. @param self A pointer to a tsk_population_table_t object. @param out The stream to write the summary to. */ void tsk_population_table_print_state(const tsk_population_table_t *self, FILE *out); /** @brief Replace this table's data by copying from a set of column arrays @rst Clears the data columns of this table and then copies column data from the specified set of arrays. The supplied arrays should all contain data on the same number of rows. The metadata schema is not affected. @endrst @param self A pointer to a tsk_population_table_t object. @param num_rows The number of rows to copy from the specified arrays. @param metadata The array of char metadata values to be copied. @param metadata_offset The array of tsk_size_t metadata offset values to be copied. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_set_columns(tsk_population_table_t *self, tsk_size_t num_rows, const char *metadata, const tsk_size_t *metadata_offset); /** @brief Extends this table by copying from a set of column arrays @rst Copies column data from the specified set of arrays to create new rows at the end of the table. The supplied arrays should all contain data on the same number of rows. The metadata schema is not affected. @endrst @param self A pointer to a tsk_population_table_t object. @param num_rows The number of rows to copy from the specified arrays. @param metadata The array of char metadata values to be copied. @param metadata_offset The array of tsk_size_t metadata offset values to be copied.
@return Return 0 on success or a negative value on failure. */ int tsk_population_table_append_columns(tsk_population_table_t *self, tsk_size_t num_rows, const char *metadata, const tsk_size_t *metadata_offset); /** @brief Controls the pre-allocation strategy for this table @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_population_table_t object. @param max_rows_increment The number of rows to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_set_max_rows_increment( tsk_population_table_t *self, tsk_size_t max_rows_increment); /** @brief Controls the pre-allocation strategy for the metadata column @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_population_table_t object. @param max_metadata_length_increment The number of bytes to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_population_table_set_max_metadata_length_increment( tsk_population_table_t *self, tsk_size_t max_metadata_length_increment); /** @} */ /* Undocumented methods */ int tsk_population_table_dump_text(const tsk_population_table_t *self, FILE *out); /** @defgroup PROVENANCE_TABLE_API_GROUP Provenance table API. @{ */ /** @brief Initialises the table by allocating the internal memory. @rst This must be called before any operations are performed on the table. See the :ref:`sec_c_api_overview_structure` for details on how objects are initialised and freed. @endrst @param self A pointer to an uninitialised tsk_provenance_table_t object. @param options Allocation time options.
Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_init(tsk_provenance_table_t *self, tsk_flags_t options); /** @brief Free the internal memory for the specified table. @param self A pointer to an initialised tsk_provenance_table_t object. @return Always returns 0. */ int tsk_provenance_table_free(tsk_provenance_table_t *self); /** @brief Adds a row to this provenance table. @rst Add a new provenance with the specified ``timestamp`` and ``record`` to the table. Copies of the ``timestamp`` and ``record`` are immediately taken. See the :ref:`table definition <sec_provenance_table_definition>` for details of the columns in this table. @endrst @param self A pointer to a tsk_provenance_table_t object. @param timestamp The timestamp to be associated with the new provenance. This is a pointer to arbitrary memory. Can be ``NULL`` if ``timestamp_length`` is 0. @param timestamp_length The size of the timestamp array in bytes. @param record The record to be associated with the new provenance. This is a pointer to arbitrary memory. Can be ``NULL`` if ``record_length`` is 0. @param record_length The size of the record array in bytes. @return Return the ID of the newly added provenance on success, or a negative value on failure. */ tsk_id_t tsk_provenance_table_add_row(tsk_provenance_table_t *self, const char *timestamp, tsk_size_t timestamp_length, const char *record, tsk_size_t record_length); /** @brief Updates the row at the specified index. @rst Rewrite the row at the specified index in this table to use the specified values. Copies of the ``timestamp`` and ``record`` parameters are taken immediately. See the :ref:`table definition <sec_provenance_table_definition>` for details of the columns in this table. ..
warning:: Because of the way that ragged columns are encoded, this method requires a full rewrite of the internal column memory in worst case, and would therefore be inefficient for bulk updates for such columns. However, if the sizes of all ragged column values are unchanged in the updated row, this method is guaranteed to only update the memory for the row in question. @endrst @param self A pointer to a tsk_provenance_table_t object. @param index The row to update. @param timestamp The timestamp to be associated with the provenance. This is a pointer to arbitrary memory. Can be ``NULL`` if ``timestamp_length`` is 0. @param timestamp_length The size of the timestamp array in bytes. @param record The record to be associated with the provenance. This is a pointer to arbitrary memory. Can be ``NULL`` if ``record_length`` is 0. @param record_length The size of the record array in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_update_row(tsk_provenance_table_t *self, tsk_id_t index, const char *timestamp, tsk_size_t timestamp_length, const char *record, tsk_size_t record_length); /** @brief Clears this table, setting the number of rows to zero. @rst No memory is freed as a result of this operation; please use :c:func:`tsk_provenance_table_free` to free the table's internal resources. @endrst @param self A pointer to a tsk_provenance_table_t object. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_clear(tsk_provenance_table_t *self); /** @brief Truncates this table so that only the first num_rows are retained. @param self A pointer to a tsk_provenance_table_t object. @param num_rows The number of rows to retain in the table. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_truncate(tsk_provenance_table_t *self, tsk_size_t num_rows); /** @brief Extends this table by appending rows copied from another table.
@rst Appends the rows at the specified indexes from the table ``other`` to the end of this table. Row indexes can be repeated and in any order. If ``row_indexes`` is NULL, append the first ``num_rows`` from ``other`` to this table. @endrst @param self A pointer to a tsk_provenance_table_t object where rows are to be added. @param other A pointer to a tsk_provenance_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_extend(tsk_provenance_table_t *self, const tsk_provenance_table_t *other, tsk_size_t num_rows, const tsk_id_t *row_indexes, tsk_flags_t options); /** @brief Subset this table by keeping rows according to a boolean mask. @rst Deletes rows from this table and optionally returns the mapping from IDs in the current table to the updated table. Rows are kept or deleted according to the specified boolean array ``keep`` such that for each row ``j`` if ``keep[j]`` is false (zero) the row is deleted, and otherwise the row is retained. Thus, ``keep`` must be an array of at least ``num_rows`` :c:type:`bool` values. If the ``id_map`` argument is non-null, this array will be updated to represent the mapping between IDs before and after row deletion. For row ``j``, ``id_map[j]`` will contain the new ID for row ``j`` if it is retained, or :c:macro:`TSK_NULL` if the row has been removed. Thus, ``id_map`` must be an array of at least ``num_rows`` :c:type:`tsk_id_t` values. ..
warning:: C++ users need to be careful to specify the correct type when passing in values for the ``keep`` array, using ``std::vector<tsk_bool_t>`` and not ``std::vector<bool>``, as the latter may not be the correct size. @endrst @param self A pointer to a tsk_provenance_table_t object. @param keep Array of boolean flags describing whether a particular row should be kept or not. Must be at least ``num_rows`` long. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @param id_map An array in which to store the mapping between new and old IDs. If NULL, this will be ignored. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_keep_rows(tsk_provenance_table_t *self, const tsk_bool_t *keep, tsk_flags_t options, tsk_id_t *id_map); /** @brief Returns true if the data in the specified table is identical to the data in this table. @rst **Options** Options to control the comparison can be specified by providing one or more of the following bitwise flags. By default (options=0) tables are considered equal if they are byte-wise identical in all columns. - :c:macro:`TSK_CMP_IGNORE_TIMESTAMPS` @endrst @param self A pointer to a tsk_provenance_table_t object. @param other A pointer to a tsk_provenance_table_t object. @param options Bitwise comparison options. @return Return true if the specified table is equal to this table. */ bool tsk_provenance_table_equals(const tsk_provenance_table_t *self, const tsk_provenance_table_t *other, tsk_flags_t options); /** @brief Copies the state of this table into the specified destination. @rst By default the method initialises the specified destination table. If the destination is already initialised, the :c:macro:`TSK_NO_INIT` option should be supplied to avoid leaking memory. @endrst @param self A pointer to a tsk_provenance_table_t object. @param dest A pointer to a tsk_provenance_table_t object.
If the TSK_NO_INIT option is specified, this must be an initialised provenance table. If not, it must be an uninitialised provenance table. @param options Bitwise option flags. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_copy(const tsk_provenance_table_t *self, tsk_provenance_table_t *dest, tsk_flags_t options); /** @brief Get the row at the specified index. @rst Updates the specified provenance struct to reflect the values in the specified row. Pointers to memory within this struct are handled by the table and should **not** be freed by client code. These pointers are guaranteed to be valid until the next operation that modifies the table (e.g., by adding a new row), but not afterwards. @endrst @param self A pointer to a tsk_provenance_table_t object. @param index The requested table row. @param row A pointer to a tsk_provenance_t struct that is updated to reflect the values in the specified row. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_get_row( const tsk_provenance_table_t *self, tsk_id_t index, tsk_provenance_t *row); /** @brief Print out the state of this table to the specified stream. This method is intended for debugging purposes and should not be used in production code. The format of the output should **not** be depended on and may change arbitrarily between versions. @param self A pointer to a tsk_provenance_table_t object. @param out The stream to write the summary to. */ void tsk_provenance_table_print_state(const tsk_provenance_table_t *self, FILE *out); /** @brief Replace this table's data by copying from a set of column arrays @rst Clears the data columns of this table and then copies column data from the specified set of arrays. The supplied arrays should all contain data on the same number of rows. The metadata schema is not affected. @endrst @param self A pointer to a tsk_provenance_table_t object.
@param num_rows The number of rows to copy from the specified arrays. @param timestamp The array of char timestamp values to be copied. @param timestamp_offset The array of tsk_size_t timestamp offset values to be copied. @param record The array of char record values to be copied. @param record_offset The array of tsk_size_t record offset values to be copied. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_set_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, const char *timestamp, const tsk_size_t *timestamp_offset, const char *record, const tsk_size_t *record_offset); /** @brief Extends this table by copying from a set of column arrays @rst Copies column data from the specified set of arrays to create new rows at the end of the table. The supplied arrays should all contain data on the same number of rows. The metadata schema is not affected. @endrst @param self A pointer to a tsk_provenance_table_t object. @param num_rows The number of rows to copy from the specified arrays. @param timestamp The array of char timestamp values to be copied. @param timestamp_offset The array of tsk_size_t timestamp offset values to be copied. @param record The array of char record values to be copied. @param record_offset The array of tsk_size_t record offset values to be copied. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_append_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, const char *timestamp, const tsk_size_t *timestamp_offset, const char *record, const tsk_size_t *record_offset); /** @brief Controls the pre-allocation strategy for this table @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_provenance_table_t object.
@param max_rows_increment The number of rows to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_set_max_rows_increment( tsk_provenance_table_t *self, tsk_size_t max_rows_increment); /** @brief Controls the pre-allocation strategy for the timestamp column @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_provenance_table_t object. @param max_timestamp_length_increment The number of bytes to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_set_max_timestamp_length_increment( tsk_provenance_table_t *self, tsk_size_t max_timestamp_length_increment); /** @brief Controls the pre-allocation strategy for the record column @rst Set a fixed pre-allocation size, or use the default doubling strategy. See :ref:`sec_c_api_memory_allocation_strategy` for details on the default pre-allocation strategy. @endrst @param self A pointer to a tsk_provenance_table_t object. @param max_record_length_increment The number of bytes to pre-allocate, or zero for the default doubling strategy. @return Return 0 on success or a negative value on failure. */ int tsk_provenance_table_set_max_record_length_increment( tsk_provenance_table_t *self, tsk_size_t max_record_length_increment); /** @} */ /* Undocumented methods */ int tsk_provenance_table_dump_text(const tsk_provenance_table_t *self, FILE *out); /****************************************************************************/ /* Table collection .*/ /****************************************************************************/ /** @defgroup TABLE_COLLECTION_API_GROUP Table collection API.
@{ */ /** @brief Initialises the table collection by allocating the internal memory and initialising all the constituent tables. @rst This must be called before any operations are performed on the table collection. See the :ref:`sec_c_api_overview_structure` for details on how objects are initialised and freed. **Options** Options can be specified by providing bitwise flags: - :c:macro:`TSK_TC_NO_EDGE_METADATA` @endrst @param self A pointer to an uninitialised tsk_table_collection_t object. @param options Allocation time options as above. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_init(tsk_table_collection_t *self, tsk_flags_t options); /** @brief Free the internal memory for the specified table collection. @param self A pointer to an initialised tsk_table_collection_t object. @return Always returns 0. */ int tsk_table_collection_free(tsk_table_collection_t *self); /** @brief Clears data tables (and optionally provenances and metadata) in this table collection. @rst By default this operation clears all tables except the provenance table, retaining table metadata schemas and the tree-sequence level metadata and schema. No memory is freed as a result of this operation; please use :c:func:`tsk_table_collection_free` to free internal resources. **Options** Options can be specified by providing one or more of the following bitwise flags: - :c:macro:`TSK_CLEAR_PROVENANCE` - :c:macro:`TSK_CLEAR_METADATA_SCHEMAS` - :c:macro:`TSK_CLEAR_TS_METADATA_AND_SCHEMA` @endrst @param self A pointer to a tsk_table_collection_t object. @param options Bitwise clearing options. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_clear(tsk_table_collection_t *self, tsk_flags_t options); /** @brief Returns true if the data in the specified table collection is equal to the data in this table collection. @rst Returns true if the two table collections are equal.
The indexes are not considered as these are derived from the tables. We also do not consider the ``file_uuid``, since it is a property of the file that the set of tables is stored in. **Options** Options to control the comparison can be specified by providing one or more of the following bitwise flags. By default (options=0) two table collections are considered equal if all of the tables are byte-wise identical, and the sequence lengths, metadata and metadata schemas of the two table collections are identical. - :c:macro:`TSK_CMP_IGNORE_PROVENANCE` - :c:macro:`TSK_CMP_IGNORE_METADATA` - :c:macro:`TSK_CMP_IGNORE_TS_METADATA` - :c:macro:`TSK_CMP_IGNORE_TIMESTAMPS` - :c:macro:`TSK_CMP_IGNORE_TABLES` - :c:macro:`TSK_CMP_IGNORE_REFERENCE_SEQUENCE` @endrst @param self A pointer to a tsk_table_collection_t object. @param other A pointer to a tsk_table_collection_t object. @param options Bitwise comparison options. @return Return true if the specified table collection is equal to this table collection. */ bool tsk_table_collection_equals(const tsk_table_collection_t *self, const tsk_table_collection_t *other, tsk_flags_t options); /** @brief Copies the state of this table collection into the specified destination. @rst By default the method initialises the specified destination table collection. If the destination is already initialised, the :c:macro:`TSK_NO_INIT` option should be supplied to avoid leaking memory. **Options** Options can be specified by providing bitwise flags: :c:macro:`TSK_COPY_FILE_UUID` @endrst @param self A pointer to a tsk_table_collection_t object. @param dest A pointer to a tsk_table_collection_t object. If the TSK_NO_INIT option is specified, this must be an initialised table collection. If not, it must be an uninitialised table collection. @param options Bitwise option flags. @return Return 0 on success or a negative value on failure.
*/ int tsk_table_collection_copy(const tsk_table_collection_t *self, tsk_table_collection_t *dest, tsk_flags_t options); /** @brief Print out the state of this table collection to the specified stream. This method is intended for debugging purposes and should not be used in production code. The format of the output should **not** be depended on and may change arbitrarily between versions. @param self A pointer to a tsk_table_collection_t object. @param out The stream to write the summary to. */ void tsk_table_collection_print_state(const tsk_table_collection_t *self, FILE *out); /** @brief Load a table collection from a file path. @rst Loads the data from the specified file into this table collection. By default, the table collection is also initialised. The resources allocated must be freed using :c:func:`tsk_table_collection_free` even in error conditions. If the :c:macro:`TSK_NO_INIT` option is set, the table collection is not initialised, allowing an already initialised table collection to be overwritten with the data from a file. If the file contains multiple table collections, this function will load the first. Please see :c:func:`tsk_table_collection_loadf` for details on how to sequentially load table collections from a stream. If the :c:macro:`TSK_LOAD_SKIP_TABLES` option is set, only the non-table information from the table collection will be read, leaving all tables with zero rows and no metadata or schema. If the :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` option is set, the table collection is read without loading the reference sequence. **Options** Options can be specified by providing one or more of the following bitwise flags: - :c:macro:`TSK_NO_INIT` - :c:macro:`TSK_LOAD_SKIP_TABLES` - :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` **Examples** ..
code-block:: c int ret; tsk_table_collection_t tables; ret = tsk_table_collection_load(&tables, "data.trees", 0); if (ret != 0) { fprintf(stderr, "Load error:%s\n", tsk_strerror(ret)); exit(EXIT_FAILURE); } @endrst @param self A pointer to an uninitialised tsk_table_collection_t object if the TSK_NO_INIT option is not set (default), or an initialised tsk_table_collection_t otherwise. @param filename A NULL terminated string containing the filename. @param options Bitwise options. See above for details. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_load( tsk_table_collection_t *self, const char *filename, tsk_flags_t options); /** @brief Load a table collection from a stream. @rst Loads a tables definition from the specified file stream to this table collection. By default, the table collection is also initialised. The resources allocated must be freed using :c:func:`tsk_table_collection_free` even in error conditions. If the :c:macro:`TSK_NO_INIT` option is set, the table collection is not initialised, allowing an already initialised table collection to be overwritten with the data from a file. The stream can be an arbitrary file descriptor, for example a network socket. No seek operations are performed. If the stream contains multiple table collection definitions, this function will load the next table collection from the stream. If the stream contains no more table collection definitions the error value :c:macro:`TSK_ERR_EOF` will be returned. Note that EOF is only returned in the case where zero bytes are read from the stream --- malformed files or other errors will result in different error conditions. Please see the :ref:`sec_c_api_examples_file_streaming` section for an example of how to sequentially load tree sequences from a stream. Please note that this streaming behaviour is not supported if the :c:macro:`TSK_LOAD_SKIP_TABLES` or :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` option is set.
If the :c:macro:`TSK_LOAD_SKIP_TABLES` option is set, only the non-table information from the table collection will be read, leaving all tables with zero rows and no metadata or schema. If the :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` option is set, the table collection is read without loading the reference sequence. When attempting to read from a stream with multiple table collection definitions and either of these two options set, the requested information from the first table collection will be read on the first call to :c:func:`tsk_table_collection_loadf`, with subsequent calls leading to errors. **Options** Options can be specified by providing one or more of the following bitwise flags: - :c:macro:`TSK_NO_INIT` - :c:macro:`TSK_LOAD_SKIP_TABLES` - :c:macro:`TSK_LOAD_SKIP_REFERENCE_SEQUENCE` @endrst @param self A pointer to an uninitialised tsk_table_collection_t object if the TSK_NO_INIT option is not set (default), or an initialised tsk_table_collection_t otherwise. @param file A FILE stream opened in an appropriate mode for reading (e.g. "r", "r+" or "w+") positioned at the beginning of a table collection definition. @param options Bitwise options. See above for details. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_loadf( tsk_table_collection_t *self, FILE *file, tsk_flags_t options); /** @brief Write a table collection to file. @rst Writes the data from this table collection to the specified file. If an error occurs the file path is deleted, ensuring that only complete and well formed files will be written. **Examples** .. code-block:: c int ret; tsk_table_collection_t tables; ret = tsk_table_collection_init(&tables, 0); error_check(ret); tables.sequence_length = 1.0; // Write out the empty tree sequence ret = tsk_table_collection_dump(&tables, "empty.trees", 0); error_check(ret); @endrst @param self A pointer to an initialised tsk_table_collection_t object.
@param filename A NULL terminated string containing the filename. @param options Bitwise options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_dump( const tsk_table_collection_t *self, const char *filename, tsk_flags_t options); /** @brief Write a table collection to a stream. @rst Writes the data from this table collection to the specified FILE stream. Semantics are identical to :c:func:`tsk_table_collection_dump`. Please see the :ref:`sec_c_api_examples_file_streaming` section for an example of how to sequentially dump and load tree sequences from a stream. @endrst @param self A pointer to an initialised tsk_table_collection_t object. @param file A FILE stream opened in an appropriate mode for writing (e.g. "w", "a", "r+" or "w+"). @param options Bitwise options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_dumpf( const tsk_table_collection_t *self, FILE *file, tsk_flags_t options); /** @brief Record the number of rows in each table in the specified tsk_bookmark_t object. @param self A pointer to an initialised tsk_table_collection_t object. @param bookmark A pointer to a tsk_bookmark_t which is updated to contain the number of rows in all tables. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_record_num_rows( const tsk_table_collection_t *self, tsk_bookmark_t *bookmark); /** @brief Truncates the tables in this table collection according to the specified bookmark. @rst Truncate the tables in this collection so that each one has the number of rows specified in the parameter :c:type:`tsk_bookmark_t`. Use the :c:func:`tsk_table_collection_record_num_rows` function to record the number rows for each table in a table collection at a particular time. 
@endrst @param self A pointer to a tsk_table_collection_t object. @param bookmark The number of rows to retain in each table. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_truncate( tsk_table_collection_t *self, tsk_bookmark_t *bookmark); /** @brief Sorts the tables in this collection. @rst Some of the tables in a table collection must satisfy specific sortedness requirements in order to define a :ref:`valid tree sequence `. This method sorts the ``edge``, ``site``, ``mutation`` and ``individual`` tables such that these requirements are guaranteed to be fulfilled. The ``node``, ``population`` and ``provenance`` tables do not have any sortedness requirements, and are therefore ignored by this method. .. note:: The current implementation **may** sort in such a way that exceeds these requirements, but this behaviour should not be relied upon and later versions may weaken the level of sortedness. However, the method does **guarantee** that the resulting tables describe a valid tree sequence. .. warning:: Sorting migrations is currently not supported and an error will be raised if a table collection containing a non-empty migration table is specified. The specified :c:type:`tsk_bookmark_t` allows us to specify a start position for sorting in each of the tables; rows before this value are assumed to already be in sorted order and this information is used to make sorting more efficient. Positions in tables that are not sorted (``node``, ``population`` and ``provenance``) are ignored and can be set to arbitrary values. .. warning:: The current implementation only supports specifying a start position for the ``edge`` table and in a limited form for the ``site``, ``mutation`` and ``individual`` tables. Specifying a non-zero ``migration`` start position results in an error.
The start positions for the ``site``, ``mutation`` and ``individual`` tables can either be 0 or the length of the respective tables, allowing these tables to either be fully sorted, or not sorted at all. The table collection will always be unindexed after sort successfully completes. For more control over the sorting process, see the :ref:`sec_c_api_low_level_sorting` section. **Options** Options can be specified by providing one or more of the following bitwise flags: :c:macro:`TSK_NO_CHECK_INTEGRITY` Do not run integrity checks using :c:func:`tsk_table_collection_check_integrity` before sorting, potentially leading to a small reduction in execution time. This performance optimisation should not be used unless the calling code can guarantee reference integrity within the table collection. References to rows not in the table or bad offsets will result in undefined behaviour. @endrst @param self A pointer to a tsk_table_collection_t object. @param start The position to begin sorting in each table; all rows less than this position must fulfill the tree sequence sortedness requirements. If this is NULL, sort all rows. @param options Sort options. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_sort( tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options); /** @brief Sorts the individual table in this collection. @rst Sorts the individual table in place, so that parents come before children, and the parent column is remapped as required. Node references to individuals are also updated. @endrst @param self A pointer to a tsk_table_collection_t object. @param options Sort options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_individual_topological_sort( tsk_table_collection_t *self, tsk_flags_t options); /** @brief Puts the tables into canonical form. 
@rst Put tables into canonical form such that randomly reshuffled tables are guaranteed to always be sorted in the same order, and redundant information is removed. The canonical sorting exceeds the usual tree sequence sortedness requirements. **Options**: Options can be specified by providing one or more of the following bitwise flags: - :c:macro:`TSK_SUBSET_KEEP_UNREFERENCED` @endrst @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t options); /** @brief Simplify the tables to remove redundant information. @rst Simplification transforms the tables to remove redundancy and canonicalise tree sequence data. See the :ref:`simplification ` tutorial for more details. A mapping from the node IDs in the table before simplification to their equivalent values after simplification can be obtained via the ``node_map`` argument. If this is non NULL, ``node_map[u]`` will contain the new ID for node ``u`` after simplification, or :c:macro:`TSK_NULL` if the node has been removed. Thus, ``node_map`` must be an array of at least ``self->nodes.num_rows`` :c:type:`tsk_id_t` values. If the `TSK_SIMPLIFY_NO_FILTER_NODES` option is specified, the node table will be unaltered except for changing the sample status of nodes (but see the `TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option below) and to update references to other tables that may have changed as a result of filtering (see below). The ``node_map`` (if specified) will always be the identity mapping, such that ``node_map[u] == u`` for all nodes. Note also that the order of the list of samples is not important in this case. 
When a table is not filtered (i.e., if the `TSK_SIMPLIFY_NO_FILTER_NODES` option is provided or the `TSK_SIMPLIFY_FILTER_SITES`, `TSK_SIMPLIFY_FILTER_POPULATIONS` or `TSK_SIMPLIFY_FILTER_INDIVIDUALS` options are *not* provided) the corresponding table is modified as little as possible, and all pointers are guaranteed to remain valid after simplification. The only changes made to an unfiltered table are to update any references to tables that may have changed (for example, remapping population IDs in the node table if `TSK_SIMPLIFY_FILTER_POPULATIONS` was specified) or altering the sample status flag of nodes. .. note:: It is possible for populations and individuals to be filtered even if `TSK_SIMPLIFY_NO_FILTER_NODES` is specified because there may be entirely unreferenced entities in the input tables, which are not affected by whether we filter nodes or not. By default, the node sample flags are updated by unsetting the :c:macro:`TSK_NODE_IS_SAMPLE` flag for all nodes and subsequently setting it for the nodes provided as input to this function. The `TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` option will prevent this from occurring, making it the responsibility of calling code to keep track of the ultimate sample status of nodes. Using this option in conjunction with `TSK_SIMPLIFY_NO_FILTER_NODES` (and without the `TSK_SIMPLIFY_FILTER_POPULATIONS` and `TSK_SIMPLIFY_FILTER_INDIVIDUALS` options) guarantees that the node table will not be written to during the lifetime of this function. The table collection will always be unindexed after simplify successfully completes. .. note:: Migrations are currently not supported by simplify, and an error will be raised if we attempt to call simplify on a table collection with greater than zero migrations.
See ``_ **Options**: Options can be specified by providing one or more of the following bitwise flags: - :c:macro:`TSK_SIMPLIFY_FILTER_SITES` - :c:macro:`TSK_SIMPLIFY_FILTER_POPULATIONS` - :c:macro:`TSK_SIMPLIFY_FILTER_INDIVIDUALS` - :c:macro:`TSK_SIMPLIFY_NO_FILTER_NODES` - :c:macro:`TSK_SIMPLIFY_NO_UPDATE_SAMPLE_FLAGS` - :c:macro:`TSK_SIMPLIFY_REDUCE_TO_SITE_TOPOLOGY` - :c:macro:`TSK_SIMPLIFY_KEEP_UNARY` - :c:macro:`TSK_SIMPLIFY_KEEP_INPUT_ROOTS` - :c:macro:`TSK_SIMPLIFY_KEEP_UNARY_IN_INDIVIDUALS` @endrst @param self A pointer to a tsk_table_collection_t object. @param samples Either NULL or an array of num_samples distinct and valid node IDs. If non-null the nodes in this array will be marked as samples in the output. If NULL, the num_samples parameter is ignored and the samples in the output will be the same as the samples in the input. This is equivalent to populating the samples array with all of the sample nodes in the input in increasing order of ID. @param num_samples The number of node IDs in the input samples array. Ignored if the samples array is NULL. @param options Simplify options; see above for the available bitwise flags. For the default behaviour, a value of 0 should be provided. @param node_map If not NULL, this array will be filled to define the mapping between nodes IDs in the table collection before and after simplification. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_simplify(tsk_table_collection_t *self, const tsk_id_t *samples, tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map); /** @brief Subsets and reorders a table collection according to an array of nodes. @rst Reduces the table collection to contain only the entries referring to the provided list of nodes, with nodes reordered according to the order they appear in the ``nodes`` argument. Specifically, this subsets and reorders each of the tables as follows (but see options, below): 1. 
Nodes: if in the list of nodes, and in the order provided. 2. Individuals: if referred to by a retained node. 3. Populations: if referred to by a retained node, and in the order first seen when traversing the list of retained nodes. 4. Edges: if both parent and child are retained nodes. 5. Mutations: if the mutation's node is a retained node. 6. Sites: if any mutations remain at the site after removing mutations. Retained individuals, edges, mutations, and sites appear in the same order as in the original tables. Note that only the information *directly* associated with the provided nodes is retained - for instance, subsetting to nodes=[A, B] does not retain nodes ancestral to A and B, and only retains the individuals A and B are in, and not their parents. This function does *not* require the tables to be sorted. .. note:: Migrations are currently not supported by subset, and an error will be raised if we attempt to call subset on a table collection with greater than zero migrations. **Options**: Options can be specified by providing one or more of the following bitwise flags: - :c:macro:`TSK_SUBSET_NO_CHANGE_POPULATIONS` - :c:macro:`TSK_SUBSET_KEEP_UNREFERENCED` @endrst @param self A pointer to a tsk_table_collection_t object. @param nodes An array of num_nodes valid node IDs. @param num_nodes The number of node IDs in the input nodes array. @param options Bitwise option flags. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, tsk_size_t num_nodes, tsk_flags_t options); /** @brief Forms the node-wise union of two table collections. @rst Expands this table collection by adding the non-shared portions of another table collection to itself. The ``other_node_mapping`` encodes which nodes in ``other`` are equivalent to a node in ``self``.
The positions in the ``other_node_mapping`` array correspond to node ids in ``other``, and the elements encode the equivalent node id in ``self`` or :c:macro:`TSK_NULL` if the node is exclusive to ``other``. Nodes that are exclusive to ``other`` are added to ``self``, along with: 1. Individuals which are new to ``self``. 2. Edges whose parent or child are new to ``self``. 3. Sites which were not present in ``self``. 4. Mutations whose nodes are new to ``self``. By default, populations of newly added nodes are assumed to be new populations, and added to the population table as well. The behavior can be changed by the flags ``TSK_UNION_ALL_EDGES`` and ``TSK_UNION_ALL_MUTATIONS``, which will (respectively) add *all* edges or *all* sites and mutations instead. This operation will also sort the resulting tables, so the tables may change even if nothing new is added, if the original tables were not sorted. .. note:: Migrations are currently not supported by union, and an error will be raised if we attempt to call union on a table collection with migrations. **Options**: Options can be specified by providing one or more of the following bitwise flags: - :c:macro:`TSK_UNION_NO_CHECK_SHARED` - :c:macro:`TSK_UNION_NO_ADD_POP` @endrst @param self A pointer to a tsk_table_collection_t object. @param other A pointer to a tsk_table_collection_t object. @param other_node_mapping An array of node IDs that relate nodes in other to nodes in self: the k-th element of other_node_mapping should be the index of the equivalent node in self, or TSK_NULL if the node is not present in self (in which case it will be added to self). @param options Union options; see above for the available bitwise flags. For the default behaviour, a value of 0 should be provided. @return Return 0 on success or a negative value on failure.
*/ int tsk_table_collection_union(tsk_table_collection_t *self, const tsk_table_collection_t *other, const tsk_id_t *other_node_mapping, tsk_flags_t options); /** @brief Set the time_units @rst Copies the time_units string to this table collection, replacing any existing. @endrst @param self A pointer to a tsk_table_collection_t object. @param time_units A pointer to a char array. @param time_units_length The size of the time units string in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_set_time_units( tsk_table_collection_t *self, const char *time_units, tsk_size_t time_units_length); /** @brief Set the metadata @rst Copies the metadata string to this table collection, replacing any existing. @endrst @param self A pointer to a tsk_table_collection_t object. @param metadata A pointer to a char array. @param metadata_length The size of the metadata in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_set_metadata( tsk_table_collection_t *self, const char *metadata, tsk_size_t metadata_length); /** @brief Set the metadata schema @rst Copies the metadata schema string to this table collection, replacing any existing. @endrst @param self A pointer to a tsk_table_collection_t object. @param metadata_schema A pointer to a char array. @param metadata_schema_length The size of the metadata schema in bytes. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_set_metadata_schema(tsk_table_collection_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length); /** @brief Returns true if this table collection is indexed. @rst This method returns true if the table collection has an index for the edge table. It guarantees that the index exists, and that it is for the same number of edges that are in the edge table. 
It does *not* guarantee that the index is valid (i.e., if the rows in the edge have been permuted in some way since the index was built). See the :ref:`sec_c_api_table_indexes` section for details on the index life-cycle. @endrst @param self A pointer to a tsk_table_collection_t object. @param options Bitwise options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return true if there is an index present for this table collection. */ bool tsk_table_collection_has_index( const tsk_table_collection_t *self, tsk_flags_t options); /** @brief Deletes the indexes for this table collection. @rst Unconditionally drop the indexes that may be present for this table collection. It is not an error to call this method on an unindexed table collection. See the :ref:`sec_c_api_table_indexes` section for details on the index life-cycle. @endrst @param self A pointer to a tsk_table_collection_t object. @param options Bitwise options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Always returns 0. */ int tsk_table_collection_drop_index(tsk_table_collection_t *self, tsk_flags_t options); /** @brief Builds indexes for this table collection. @rst Builds the tree traversal :ref:`indexes ` for this table collection. Any existing index is first dropped using :c:func:`tsk_table_collection_drop_index`. See the :ref:`sec_c_api_table_indexes` section for details on the index life-cycle. @endrst @param self A pointer to a tsk_table_collection_t object. @param options Bitwise options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_build_index(tsk_table_collection_t *self, tsk_flags_t options); /** @brief Runs integrity checks on this table collection. @rst Checks the integrity of this table collection. 
The default checks (i.e., with options = 0) guarantee the integrity of memory and entity references within the table collection. All positions along the genome are checked to see if they are finite values and within the required bounds. Time values are checked to see if they are finite or marked as unknown. Consistency of the direction of inheritance is also checked: whether parents are more recent than children, mutations are not more recent than their nodes or their mutation parents, etcetera. To check if a set of tables fulfills the :ref:`requirements ` needed for a valid tree sequence, use the :c:macro:`TSK_CHECK_TREES` option. When this method is called with :c:macro:`TSK_CHECK_TREES`, the number of trees in the tree sequence is returned. Thus, to check for errors client code should verify that the return value is less than zero. All other options will return zero on success and a negative value on failure. More fine-grained checks can be achieved using bitwise combinations of the other options. **Options**: Options can be specified by providing one or more of the following bitwise flags: - :c:macro:`TSK_CHECK_EDGE_ORDERING` - :c:macro:`TSK_CHECK_SITE_ORDERING` - :c:macro:`TSK_CHECK_SITE_DUPLICATES` - :c:macro:`TSK_CHECK_MUTATION_ORDERING` - :c:macro:`TSK_CHECK_INDIVIDUAL_ORDERING` - :c:macro:`TSK_CHECK_MIGRATION_ORDERING` - :c:macro:`TSK_CHECK_INDEXES` - :c:macro:`TSK_CHECK_TREES` - :c:macro:`TSK_NO_CHECK_POPULATION_REFS` @endrst @param self A pointer to a tsk_table_collection_t object. @param options Bitwise options. @return Return a negative error value if any problems are detected in the tree sequence. If the TSK_CHECK_TREES option is provided, the number of trees in the tree sequence will be returned, on success.
*/ tsk_id_t tsk_table_collection_check_integrity( const tsk_table_collection_t *self, tsk_flags_t options); /** @} */ /* Undocumented methods */ /* Flags for ibd_segments */ #define TSK_IBD_STORE_PAIRS (1 << 0) #define TSK_IBD_STORE_SEGMENTS (1 << 1) /* TODO be systematic about where "result" should be in the params * list, different here and in link_ancestors. */ /* FIXME the order of num_samples and samples needs to be reversed in within. * This should be done as part of documenting, I guess. */ int tsk_table_collection_ibd_within(const tsk_table_collection_t *self, tsk_identity_segments_t *result, const tsk_id_t *samples, tsk_size_t num_samples, double min_span, double max_time, tsk_flags_t options); int tsk_table_collection_ibd_between(const tsk_table_collection_t *self, tsk_identity_segments_t *result, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, double min_span, double max_time, tsk_flags_t options); int tsk_table_collection_link_ancestors(tsk_table_collection_t *self, tsk_id_t *samples, tsk_size_t num_samples, tsk_id_t *ancestors, tsk_size_t num_ancestors, tsk_flags_t options, tsk_edge_table_t *result); int tsk_table_collection_deduplicate_sites( tsk_table_collection_t *tables, tsk_flags_t options); int tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t options); int tsk_table_collection_compute_mutation_times( tsk_table_collection_t *self, double *random, tsk_flags_t options); int tsk_table_collection_delete_older( tsk_table_collection_t *self, double time, tsk_flags_t options); int tsk_table_collection_set_indexes(tsk_table_collection_t *self, tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order); int tsk_table_collection_takeset_metadata( tsk_table_collection_t *self, char *metadata, tsk_size_t metadata_length); int tsk_table_collection_takeset_indexes(tsk_table_collection_t *self, tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order); int 
tsk_individual_table_takeset_columns(tsk_individual_table_t *self, tsk_size_t num_rows, tsk_flags_t *flags, double *location, tsk_size_t *location_offset, tsk_id_t *parents, tsk_size_t *parents_offset, char *metadata, tsk_size_t *metadata_offset); int tsk_node_table_takeset_columns(tsk_node_table_t *self, tsk_size_t num_rows, tsk_flags_t *flags, double *time, tsk_id_t *population, tsk_id_t *individual, char *metadata, tsk_size_t *metadata_offset); int tsk_edge_table_takeset_columns(tsk_edge_table_t *self, tsk_size_t num_rows, double *left, double *right, tsk_id_t *parent, tsk_id_t *child, char *metadata, tsk_size_t *metadata_offset); int tsk_migration_table_takeset_columns(tsk_migration_table_t *self, tsk_size_t num_rows, double *left, double *right, tsk_id_t *node, tsk_id_t *source, tsk_id_t *dest, double *time, char *metadata, tsk_size_t *metadata_offset); int tsk_site_table_takeset_columns(tsk_site_table_t *self, tsk_size_t num_rows, double *position, char *ancestral_state, tsk_size_t *ancestral_state_offset, char *metadata, tsk_size_t *metadata_offset); int tsk_mutation_table_takeset_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, double *time, char *derived_state, tsk_size_t *derived_state_offset, char *metadata, tsk_size_t *metadata_offset); int tsk_population_table_takeset_columns(tsk_population_table_t *self, tsk_size_t num_rows, char *metadata, tsk_size_t *metadata_offset); int tsk_provenance_table_takeset_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, char *timestamp, tsk_size_t *timestamp_offset, char *record, tsk_size_t *record_offset); bool tsk_table_collection_has_reference_sequence(const tsk_table_collection_t *self); int tsk_reference_sequence_init(tsk_reference_sequence_t *self, tsk_flags_t options); int tsk_reference_sequence_free(tsk_reference_sequence_t *self); bool tsk_reference_sequence_is_null(const tsk_reference_sequence_t *self); bool tsk_reference_sequence_equals(const 
tsk_reference_sequence_t *self, const tsk_reference_sequence_t *other, tsk_flags_t options); int tsk_reference_sequence_copy(const tsk_reference_sequence_t *self, tsk_reference_sequence_t *dest, tsk_flags_t options); int tsk_reference_sequence_set_data( tsk_reference_sequence_t *self, const char *data, tsk_size_t data_length); int tsk_reference_sequence_set_url( tsk_reference_sequence_t *self, const char *url, tsk_size_t url_length); int tsk_reference_sequence_set_metadata( tsk_reference_sequence_t *self, const char *metadata, tsk_size_t metadata_length); int tsk_reference_sequence_set_metadata_schema(tsk_reference_sequence_t *self, const char *metadata_schema, tsk_size_t metadata_schema_length); int tsk_reference_sequence_takeset_data( tsk_reference_sequence_t *self, char *data, tsk_size_t data_length); int tsk_reference_sequence_takeset_metadata( tsk_reference_sequence_t *self, char *metadata, tsk_size_t metadata_length); /** @defgroup TABLE_SORTER_API_GROUP Low-level table sorter API. @{ */ /* NOTE: We use the "struct _tsk_table_sorter_t" form here * rather then the usual tsk_table_sorter_t alias because * of problems with Doxygen. This was the only way I could * get it to work - ideally, we'd use the usual typedefs * to avoid confusing people. */ /** @brief Initialises the memory for the sorter object. @rst This must be called before any operations are performed on the table sorter and initialises all fields. The ``edge_sort`` function is set to the default method using qsort. The ``user_data`` field is set to NULL. This method supports the same options as :c:func:`tsk_table_collection_sort`. @endrst @param self A pointer to an uninitialised tsk_table_sorter_t object. @param tables The table collection to sort. @param options Sorting options. @return Return 0 on success or a negative value on failure. 
*/ int tsk_table_sorter_init(struct _tsk_table_sorter_t *self, tsk_table_collection_t *tables, tsk_flags_t options); /** @brief Runs the sort using the configured functions. @rst Runs the sorting process: 1. Drop the table indexes. 2. If the ``sort_edges`` function pointer is not NULL, run it. The first parameter to the called function will be a pointer to this table_sorter_t object. The second parameter will be the value ``start.edges``. This specifies the offset at which sorting should start in the edge table. This offset is guaranteed to be within the bounds of the edge table. 3. Sort the site table, building the mapping between site IDs in the current and sorted tables. 4. Sort the mutation table, using the ``sort_mutations`` pointer. If an error occurs during the execution of a user-supplied sorting function a non-zero value must be returned. This value will then be returned by ``tsk_table_sorter_run``. The error return value should be chosen to avoid conflicts with tskit error codes. See :c:func:`tsk_table_collection_sort` for details on the ``start`` parameter. @endrst @param self A pointer to a tsk_table_sorter_t object. @param start The position in the tables at which sorting starts. @return Return 0 on success or a negative value on failure. */ int tsk_table_sorter_run(struct _tsk_table_sorter_t *self, const tsk_bookmark_t *start); /** @brief Free the internal memory for the specified table sorter. @param self A pointer to an initialised tsk_table_sorter_t object. @return Always returns 0. */ int tsk_table_sorter_free(struct _tsk_table_sorter_t *self); /** @} */ int tsk_squash_edges( tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output_edges); /* IBD segments API. This is experimental and the interface may change. 
*/ tsk_size_t tsk_identity_segments_get_num_segments(const tsk_identity_segments_t *self); double tsk_identity_segments_get_total_span(const tsk_identity_segments_t *self); tsk_size_t tsk_identity_segments_get_num_pairs(const tsk_identity_segments_t *self); int tsk_identity_segments_get_keys( const tsk_identity_segments_t *result, tsk_id_t *pairs); int tsk_identity_segments_get_items(const tsk_identity_segments_t *self, tsk_id_t *pairs, tsk_identity_segment_list_t **lists); int tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b, tsk_identity_segment_list_t **ret_list); void tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out); int tsk_identity_segments_free(tsk_identity_segments_t *self); #ifdef __cplusplus } #endif #endif ================================================ FILE: c/tskit/trees.c ================================================ /* * MIT License * * Copyright (c) 2019-2025 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include static inline bool is_discrete(double x) { return trunc(x) == x; } /* ======================================================== * * tree sequence * ======================================================== */ static void tsk_treeseq_check_state(const tsk_treeseq_t *self) { tsk_size_t j; tsk_size_t k, l; tsk_site_t site; tsk_id_t site_id = 0; for (j = 0; j < self->num_trees; j++) { for (k = 0; k < self->tree_sites_length[j]; k++) { site = self->tree_sites[j][k]; tsk_bug_assert(site.id == site_id); site_id++; for (l = 0; l < site.mutations_length; l++) { tsk_bug_assert(site.mutations[l].site == site.id); } } } } void tsk_treeseq_print_state(const tsk_treeseq_t *self, FILE *out) { tsk_size_t j; tsk_size_t k, l, m; tsk_site_t site; fprintf(out, "tree_sequence state\n"); fprintf(out, "num_trees = %lld\n", (long long) self->num_trees); fprintf(out, "samples = (%lld)\n", (long long) self->num_samples); for (j = 0; j < self->num_samples; j++) { fprintf(out, "\t%lld\n", (long long) self->samples[j]); } tsk_table_collection_print_state(self->tables, out); fprintf(out, "tree_sites = \n"); for (j = 0; j < self->num_trees; j++) { fprintf(out, "tree %lld\t%lld sites\n", (long long) j, (long long) self->tree_sites_length[j]); for (k = 0; k < self->tree_sites_length[j]; k++) { site = self->tree_sites[j][k]; fprintf(out, "\tsite %lld pos = %f ancestral state = ", (long long) site.id, site.position); for (l = 0; l < site.ancestral_state_length; l++) { fprintf(out, "%c", site.ancestral_state[l]); } fprintf(out, " %lld mutations\n", (long long) site.mutations_length); for (l = 0; l < site.mutations_length; l++) { fprintf(out, "\t\tmutation %lld node = 
%lld derived_state = ", (long long) site.mutations[l].id, (long long) site.mutations[l].node); for (m = 0; m < site.mutations[l].derived_state_length; m++) { fprintf(out, "%c", site.mutations[l].derived_state[m]); } fprintf(out, "\n"); } } } tsk_treeseq_check_state(self); } int tsk_treeseq_free(tsk_treeseq_t *self) { if (self->tables != NULL) { tsk_table_collection_free(self->tables); } tsk_safe_free(self->tables); tsk_safe_free(self->samples); tsk_safe_free(self->sample_index_map); tsk_safe_free(self->breakpoints); tsk_safe_free(self->tree_sites); tsk_safe_free(self->tree_sites_length); tsk_safe_free(self->tree_sites_mem); tsk_safe_free(self->site_mutations_mem); tsk_safe_free(self->site_mutations_length); tsk_safe_free(self->site_mutations); tsk_safe_free(self->individual_nodes_mem); tsk_safe_free(self->individual_nodes_length); tsk_safe_free(self->individual_nodes); return 0; } static int tsk_treeseq_init_sites(tsk_treeseq_t *self) { tsk_id_t j, k; int ret = 0; tsk_size_t offset = 0; const tsk_size_t num_mutations = self->tables->mutations.num_rows; const tsk_size_t num_sites = self->tables->sites.num_rows; const tsk_id_t *restrict mutation_site = self->tables->mutations.site; const double *restrict site_position = self->tables->sites.position; bool discrete_sites = true; tsk_mutation_t *mutation; self->site_mutations_mem = tsk_malloc(num_mutations * sizeof(*self->site_mutations_mem)); self->site_mutations_length = tsk_malloc(num_sites * sizeof(*self->site_mutations_length)); self->site_mutations = tsk_malloc(num_sites * sizeof(*self->site_mutations)); self->tree_sites_mem = tsk_malloc(num_sites * sizeof(*self->tree_sites_mem)); if (self->site_mutations_mem == NULL || self->site_mutations_length == NULL || self->site_mutations == NULL || self->tree_sites_mem == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (k = 0; k < (tsk_id_t) num_mutations; k++) { mutation = self->site_mutations_mem + k; ret = tsk_treeseq_get_mutation(self, k, mutation); 
if (ret != 0) { goto out; } } k = 0; for (j = 0; j < (tsk_id_t) num_sites; j++) { discrete_sites = discrete_sites && is_discrete(site_position[j]); self->site_mutations[j] = self->site_mutations_mem + offset; self->site_mutations_length[j] = 0; /* Go through all mutations for this site */ while (k < (tsk_id_t) num_mutations && mutation_site[k] == j) { self->site_mutations_length[j]++; offset++; k++; } ret = tsk_treeseq_get_site(self, j, self->tree_sites_mem + j); if (ret != 0) { goto out; } } self->discrete_genome = self->discrete_genome && discrete_sites; out: return ret; } static int tsk_treeseq_init_individuals(tsk_treeseq_t *self) { int ret = 0; tsk_id_t node; tsk_id_t ind; tsk_size_t offset = 0; tsk_size_t total_node_refs = 0; tsk_size_t *node_count = NULL; tsk_id_t *node_array; const tsk_size_t num_inds = self->tables->individuals.num_rows; const tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_id_t *restrict node_individual = self->tables->nodes.individual; // First find number of nodes per individual self->individual_nodes_length = tsk_calloc(TSK_MAX(1, num_inds), sizeof(*self->individual_nodes_length)); node_count = tsk_calloc(TSK_MAX(1, num_inds), sizeof(*node_count)); if (self->individual_nodes_length == NULL || node_count == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (node = 0; node < (tsk_id_t) num_nodes; node++) { ind = node_individual[node]; if (ind != TSK_NULL) { self->individual_nodes_length[ind]++; total_node_refs++; } } self->individual_nodes_mem = tsk_malloc(TSK_MAX(1, total_node_refs) * sizeof(tsk_node_t)); self->individual_nodes = tsk_malloc(TSK_MAX(1, num_inds) * sizeof(tsk_node_t *)); if (self->individual_nodes_mem == NULL || self->individual_nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } /* Now fill in the node IDs */ for (ind = 0; ind < (tsk_id_t) num_inds; ind++) { self->individual_nodes[ind] = self->individual_nodes_mem + offset; offset += self->individual_nodes_length[ind]; } 
/* Initialises memory associated with the trees.
 *
 * Sweeps the edges in insertion/removal index order to enumerate the trees
 * and, in the same pass:
 *  - records the left coordinate of every tree in self->breakpoints, plus
 *    the final right coordinate in the extra trailing slot;
 *  - points self->tree_sites[t] at the first site of tree t inside
 *    self->tree_sites_mem and counts that tree's sites in
 *    self->tree_sites_length[t];
 *  - fills in the edge and inherited_state/inherited_state_length fields of
 *    every entry of self->site_mutations_mem;
 *  - clears self->discrete_genome if any breakpoint is not discrete.
 *
 * Reads self->num_trees, self->tree_sites_mem and self->site_mutations_mem,
 * so these must already be set when this is called.
 *
 * Returns 0 on success or TSK_ERR_NO_MEMORY if an allocation fails.
 */
static int
tsk_treeseq_init_trees(tsk_treeseq_t *self)
{
    int ret = TSK_ERR_GENERIC;
    tsk_size_t j, k, tree_index;
    tsk_id_t site_id, edge_id, mutation_id;
    double tree_left, tree_right;
    const double sequence_length = self->tables->sequence_length;
    const tsk_id_t num_sites = (tsk_id_t) self->tables->sites.num_rows;
    const tsk_id_t num_mutations = (tsk_id_t) self->tables->mutations.num_rows;
    const tsk_size_t num_edges = self->tables->edges.num_rows;
    const tsk_size_t num_nodes = self->tables->nodes.num_rows;
    const double *restrict site_position = self->tables->sites.position;
    const tsk_id_t *restrict mutation_site = self->tables->mutations.site;
    const tsk_id_t *restrict mutation_parent = self->tables->mutations.parent;
    const char *restrict sites_ancestral_state = self->tables->sites.ancestral_state;
    const tsk_size_t *restrict sites_ancestral_state_offset
        = self->tables->sites.ancestral_state_offset;
    const char *restrict mutations_derived_state
        = self->tables->mutations.derived_state;
    const tsk_size_t *restrict mutations_derived_state_offset
        = self->tables->mutations.derived_state_offset;
    /* I and O are the edge IDs sorted for insertion and removal */
    const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order;
    const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order;
    const double *restrict edge_right = self->tables->edges.right;
    const double *restrict edge_left = self->tables->edges.left;
    const tsk_id_t *restrict edge_child = self->tables->edges.child;
    /* One extra slot: breakpoints holds num_trees + 1 coordinates */
    tsk_size_t num_trees_alloc = self->num_trees + 1;
    bool discrete_breakpoints = true;
    /* node_edge_map[u] is the edge currently above node u, or TSK_NULL */
    tsk_id_t *node_edge_map = tsk_malloc(num_nodes * sizeof(*node_edge_map));
    tsk_mutation_t *mutation;
    tsk_id_t parent_id;

    self->tree_sites_length
        = tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites_length));
    self->tree_sites = tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites));
    self->breakpoints = tsk_malloc(num_trees_alloc * sizeof(*self->breakpoints));
    if (node_edge_map == NULL || self->tree_sites == NULL
        || self->tree_sites_length == NULL || self->breakpoints == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }
    /* Only the first num_trees entries are cleared; the trailing slot of
     * these two arrays is not written by this function. */
    tsk_memset(
        self->tree_sites_length, 0, self->num_trees * sizeof(*self->tree_sites_length));
    tsk_memset(self->tree_sites, 0, self->num_trees * sizeof(*self->tree_sites));
    /* Byte-wise fill with TSK_NULL marks every node as having no edge */
    tsk_memset(node_edge_map, TSK_NULL, num_nodes * sizeof(*node_edge_map));

    tree_left = 0;
    tree_right = sequence_length;
    tree_index = 0;
    site_id = 0;
    mutation_id = 0;
    j = 0;
    k = 0;
    while (j < num_edges || tree_left < sequence_length) {
        discrete_breakpoints = discrete_breakpoints && is_discrete(tree_left);
        self->breakpoints[tree_index] = tree_left;
        /* Remove the edges that end at tree_left ... */
        while (k < num_edges && edge_right[O[k]] == tree_left) {
            edge_id = O[k];
            node_edge_map[edge_child[edge_id]] = TSK_NULL;
            k++;
        }
        /* ... then insert the edges that start there */
        while (j < num_edges && edge_left[I[j]] == tree_left) {
            edge_id = I[j];
            node_edge_map[edge_child[edge_id]] = edge_id;
            j++;
        }
        /* The tree ends at the next edge start or end, whichever is first */
        tree_right = sequence_length;
        if (j < num_edges) {
            tree_right = TSK_MIN(tree_right, edge_left[I[j]]);
        }
        if (k < num_edges) {
            tree_right = TSK_MIN(tree_right, edge_right[O[k]]);
        }
        self->tree_sites[tree_index] = self->tree_sites_mem + site_id;
        /* Consume every site positioned before the end of this tree */
        while (site_id < num_sites && site_position[site_id] < tree_right) {
            self->tree_sites_length[tree_index]++;
            while (
                mutation_id < num_mutations && mutation_site[mutation_id] == site_id) {
                mutation = self->site_mutations_mem + mutation_id;
                /* Edge above the mutation's node in the current tree */
                mutation->edge = node_edge_map[mutation->node];

                /* Compute inherited state */
                if (mutation_parent[mutation_id] == TSK_NULL) {
                    /* No parent: inherited state is the site's ancestral state */
                    mutation->inherited_state
                        = sites_ancestral_state + sites_ancestral_state_offset[site_id];
                    mutation->inherited_state_length
                        = sites_ancestral_state_offset[site_id + 1]
                          - sites_ancestral_state_offset[site_id];
                } else {
                    /* Has parent: inherited state is parent's derived state */
                    parent_id = mutation_parent[mutation_id];
                    mutation->inherited_state
                        = mutations_derived_state
                          + mutations_derived_state_offset[parent_id];
                    mutation->inherited_state_length
                        = mutations_derived_state_offset[parent_id + 1]
                          - mutations_derived_state_offset[parent_id];
                }
                mutation_id++;
            }
            site_id++;
        }
        tree_left = tree_right;
        tree_index++;
    }
    tsk_bug_assert(site_id == num_sites);
    tsk_bug_assert(tree_index == self->num_trees);
    /* Trailing breakpoint: the right end of the last tree */
    self->breakpoints[tree_index] = tree_right;
    discrete_breakpoints = discrete_breakpoints && is_discrete(tree_right);
    self->discrete_genome = self->discrete_genome && discrete_breakpoints;
    ret = 0;
out:
    tsk_safe_free(node_edge_map);
    return ret;
}
sites_ancestral_state_offset[site_id]; mutation->inherited_state_length = sites_ancestral_state_offset[site_id + 1] - sites_ancestral_state_offset[site_id]; } else { /* Has parent: inherited state is parent's derived state */ parent_id = mutation_parent[mutation_id]; mutation->inherited_state = mutations_derived_state + mutations_derived_state_offset[parent_id]; mutation->inherited_state_length = mutations_derived_state_offset[parent_id + 1] - mutations_derived_state_offset[parent_id]; } mutation_id++; } site_id++; } tree_left = tree_right; tree_index++; } tsk_bug_assert(site_id == num_sites); tsk_bug_assert(tree_index == self->num_trees); self->breakpoints[tree_index] = tree_right; discrete_breakpoints = discrete_breakpoints && is_discrete(tree_right); self->discrete_genome = self->discrete_genome && discrete_breakpoints; ret = 0; out: tsk_safe_free(node_edge_map); return ret; } static void tsk_treeseq_init_migrations(tsk_treeseq_t *self) { tsk_size_t j; tsk_size_t num_migrations = self->tables->migrations.num_rows; const double *restrict left = self->tables->migrations.left; const double *restrict right = self->tables->migrations.right; const double *restrict time = self->tables->migrations.time; bool discrete_breakpoints = true; bool discrete_times = true; for (j = 0; j < num_migrations; j++) { discrete_breakpoints = discrete_breakpoints && is_discrete(left[j]) && is_discrete(right[j]); discrete_times = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); } self->discrete_genome = self->discrete_genome && discrete_breakpoints; self->discrete_time = self->discrete_time && discrete_times; } static void tsk_treeseq_init_mutations(tsk_treeseq_t *self) { tsk_size_t j; tsk_size_t num_mutations = self->tables->mutations.num_rows; const double *restrict time = self->tables->mutations.time; bool discrete_times = true; for (j = 0; j < num_mutations; j++) { discrete_times = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); } 
self->discrete_time = self->discrete_time && discrete_times; for (j = 0; j < num_mutations; j++) { if (!tsk_is_unknown_time(time[j])) { self->min_time = TSK_MIN(self->min_time, time[j]); self->max_time = TSK_MAX(self->max_time, time[j]); } } } static int tsk_treeseq_init_nodes(tsk_treeseq_t *self) { tsk_size_t j, k; tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_flags_t *restrict node_flags = self->tables->nodes.flags; const double *restrict time = self->tables->nodes.time; int ret = 0; bool discrete_times = true; /* Determine the sample size */ self->num_samples = 0; for (j = 0; j < num_nodes; j++) { if (!!(node_flags[j] & TSK_NODE_IS_SAMPLE)) { self->num_samples++; } } /* TODO raise an error if < 2 samples?? */ self->samples = tsk_malloc(self->num_samples * sizeof(tsk_id_t)); self->sample_index_map = tsk_malloc(num_nodes * sizeof(tsk_id_t)); if (self->samples == NULL || self->sample_index_map == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } k = 0; for (j = 0; j < num_nodes; j++) { self->sample_index_map[j] = -1; if (!!(node_flags[j] & TSK_NODE_IS_SAMPLE)) { self->samples[k] = (tsk_id_t) j; self->sample_index_map[j] = (tsk_id_t) k; k++; } } tsk_bug_assert(k == self->num_samples); for (j = 0; j < num_nodes; j++) { discrete_times = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); } self->discrete_time = self->discrete_time && discrete_times; for (j = 0; j < num_nodes; j++) { if (!tsk_is_unknown_time(time[j])) { self->min_time = TSK_MIN(self->min_time, time[j]); self->max_time = TSK_MAX(self->max_time, time[j]); } } out: return ret; } int TSK_WARN_UNUSED tsk_treeseq_init( tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options) { int ret = 0; tsk_id_t num_trees; tsk_memset(self, 0, sizeof(*self)); if (options & TSK_TAKE_OWNERSHIP) { self->tables = tables; if (tables->edges.options & TSK_TABLE_NO_METADATA) { ret = tsk_trace_error(TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA); goto out; } } else 
{ self->tables = tsk_malloc(sizeof(*self->tables)); if (self->tables == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } /* Note that this copy reinstates metadata for a table collection with * TSK_TC_NO_EDGE_METADATA. Otherwise a table without metadata would * crash tsk_diff_iter_next. */ ret = tsk_table_collection_copy(tables, self->tables, TSK_COPY_FILE_UUID); if (ret != 0) { goto out; } } if (options & TSK_TS_INIT_BUILD_INDEXES) { ret = tsk_table_collection_build_index(self->tables, 0); if (ret != 0) { goto out; } } if (options & TSK_TS_INIT_COMPUTE_MUTATION_PARENTS) { /* As tsk_table_collection_compute_mutation_parents performs an integrity check, and we don't wish to do that twice we perform our own check here */ num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); if (num_trees < 0) { ret = (int) num_trees; goto out; } ret = tsk_table_collection_compute_mutation_parents( self->tables, TSK_NO_CHECK_INTEGRITY); if (ret != 0) { goto out; } } else { num_trees = tsk_table_collection_check_integrity( self->tables, TSK_CHECK_TREES | TSK_CHECK_MUTATION_PARENTS); if (num_trees < 0) { ret = (int) num_trees; goto out; } } self->num_trees = (tsk_size_t) num_trees; self->discrete_genome = true; self->discrete_time = true; self->min_time = INFINITY; self->max_time = -INFINITY; ret = tsk_treeseq_init_nodes(self); if (ret != 0) { goto out; } ret = tsk_treeseq_init_sites(self); if (ret != 0) { goto out; } ret = tsk_treeseq_init_individuals(self); if (ret != 0) { goto out; } ret = tsk_treeseq_init_trees(self); if (ret != 0) { goto out; } tsk_treeseq_init_migrations(self); tsk_treeseq_init_mutations(self); if (tsk_treeseq_get_time_units_length(self) == strlen(TSK_TIME_UNITS_UNCALIBRATED) && !strncmp(tsk_treeseq_get_time_units(self), TSK_TIME_UNITS_UNCALIBRATED, strlen(TSK_TIME_UNITS_UNCALIBRATED))) { self->time_uncalibrated = true; } out: return ret; } int TSK_WARN_UNUSED tsk_treeseq_copy_tables( const tsk_treeseq_t *self, 
/* Loads a tree sequence from the file at `filename`.
 *
 * `options` is forwarded to tsk_table_collection_load. `self` is zeroed
 * before anything can fail. Returns 0 on success or a negative tskit error
 * code.
 */
int TSK_WARN_UNUSED
tsk_treeseq_load(tsk_treeseq_t *self, const char *filename, tsk_flags_t options)
{
    tsk_table_collection_t *tables;
    int ret;

    /* Need to make sure that we're zero'd out in case of error */
    tsk_memset(self, 0, sizeof(*self));

    tables = malloc(sizeof(*tables));
    if (tables == NULL) {
        return tsk_trace_error(TSK_ERR_NO_MEMORY);
    }

    ret = tsk_table_collection_load(tables, filename, options);
    if (ret != 0) {
        /* The tables were never handed over, so release them here */
        tsk_table_collection_free(tables);
        tsk_safe_free(tables);
        return ret;
    }
    /* TSK_TAKE_OWNERSHIP takes immediate ownership of the tables, regardless
     * of error conditions. */
    return tsk_treeseq_init(self, tables, TSK_TAKE_OWNERSHIP);
}

/* Loads a tree sequence from the open stream `file`.
 *
 * `options` is forwarded to tsk_table_collection_loadf. `self` is zeroed
 * before anything can fail. Returns 0 on success or a negative tskit error
 * code.
 */
int TSK_WARN_UNUSED
tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options)
{
    tsk_table_collection_t *tables;
    int ret;

    /* Need to make sure that we're zero'd out in case of error */
    tsk_memset(self, 0, sizeof(*self));

    tables = malloc(sizeof(*tables));
    if (tables == NULL) {
        return tsk_trace_error(TSK_ERR_NO_MEMORY);
    }

    ret = tsk_table_collection_loadf(tables, file, options);
    if (ret != 0) {
        /* The tables were never handed over, so release them here */
        tsk_table_collection_free(tables);
        tsk_safe_free(tables);
        return ret;
    }
    /* TSK_TAKE_OWNERSHIP takes immediate ownership of the tables, regardless
     * of error conditions. */
    return tsk_treeseq_init(self, tables, TSK_TAKE_OWNERSHIP);
}
*/ ret = tsk_treeseq_init(self, tables, TSK_TAKE_OWNERSHIP); if (ret != 0) { goto out; } out: return ret; } int TSK_WARN_UNUSED tsk_treeseq_dump(const tsk_treeseq_t *self, const char *filename, tsk_flags_t options) { return tsk_table_collection_dump(self->tables, filename, options); } int TSK_WARN_UNUSED tsk_treeseq_dumpf(const tsk_treeseq_t *self, FILE *file, tsk_flags_t options) { return tsk_table_collection_dumpf(self->tables, file, options); } /* Simple attribute getters */ const char * tsk_treeseq_get_metadata(const tsk_treeseq_t *self) { return self->tables->metadata; } tsk_size_t tsk_treeseq_get_metadata_length(const tsk_treeseq_t *self) { return self->tables->metadata_length; } const char * tsk_treeseq_get_metadata_schema(const tsk_treeseq_t *self) { return self->tables->metadata_schema; } tsk_size_t tsk_treeseq_get_metadata_schema_length(const tsk_treeseq_t *self) { return self->tables->metadata_schema_length; } const char * tsk_treeseq_get_time_units(const tsk_treeseq_t *self) { return self->tables->time_units; } tsk_size_t tsk_treeseq_get_time_units_length(const tsk_treeseq_t *self) { return self->tables->time_units_length; } double tsk_treeseq_get_sequence_length(const tsk_treeseq_t *self) { return self->tables->sequence_length; } const char * tsk_treeseq_get_file_uuid(const tsk_treeseq_t *self) { return self->tables->file_uuid; } tsk_size_t tsk_treeseq_get_num_samples(const tsk_treeseq_t *self) { return self->num_samples; } tsk_size_t tsk_treeseq_get_num_nodes(const tsk_treeseq_t *self) { return self->tables->nodes.num_rows; } tsk_size_t tsk_treeseq_get_num_edges(const tsk_treeseq_t *self) { return self->tables->edges.num_rows; } tsk_size_t tsk_treeseq_get_num_migrations(const tsk_treeseq_t *self) { return self->tables->migrations.num_rows; } tsk_size_t tsk_treeseq_get_num_sites(const tsk_treeseq_t *self) { return self->tables->sites.num_rows; } tsk_size_t tsk_treeseq_get_num_mutations(const tsk_treeseq_t *self) { return 
self->tables->mutations.num_rows; } tsk_size_t tsk_treeseq_get_num_populations(const tsk_treeseq_t *self) { return self->tables->populations.num_rows; } tsk_size_t tsk_treeseq_get_num_individuals(const tsk_treeseq_t *self) { return self->tables->individuals.num_rows; } tsk_size_t tsk_treeseq_get_num_provenances(const tsk_treeseq_t *self) { return self->tables->provenances.num_rows; } tsk_size_t tsk_treeseq_get_num_trees(const tsk_treeseq_t *self) { return self->num_trees; } const double * tsk_treeseq_get_breakpoints(const tsk_treeseq_t *self) { return self->breakpoints; } const tsk_id_t * tsk_treeseq_get_samples(const tsk_treeseq_t *self) { return self->samples; } const tsk_id_t * tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self) { return self->sample_index_map; } bool tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u) { bool ret = false; if (u >= 0 && u < (tsk_id_t) self->tables->nodes.num_rows) { ret = !!(self->tables->nodes.flags[u] & TSK_NODE_IS_SAMPLE); } return ret; } bool tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self) { return self->discrete_genome; } bool tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self) { return self->discrete_time; } double tsk_treeseq_get_min_time(const tsk_treeseq_t *self) { return self->min_time; } double tsk_treeseq_get_max_time(const tsk_treeseq_t *self) { return self->max_time; } bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self) { return tsk_table_collection_has_reference_sequence(self->tables); } int tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output) { int ret = 0; tsk_size_t i, j; tsk_individual_t ind; tsk_id_t ind_pop; const tsk_id_t *node_population = self->tables->nodes.population; const tsk_size_t num_individuals = self->tables->individuals.num_rows; tsk_memset(output, TSK_NULL, num_individuals * sizeof(*output)); for (i = 0; i < num_individuals; i++) { ret = tsk_treeseq_get_individual(self, (tsk_id_t) i, &ind); tsk_bug_assert(ret == 0); 
if (ind.nodes_length > 0) { ind_pop = -2; for (j = 0; j < ind.nodes_length; j++) { if (ind_pop == -2) { ind_pop = node_population[ind.nodes[j]]; } else if (ind_pop != node_population[ind.nodes[j]]) { ret = tsk_trace_error(TSK_ERR_INDIVIDUAL_POPULATION_MISMATCH); goto out; } } output[ind.id] = ind_pop; } } out: return ret; } int tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output) { int ret = 0; tsk_size_t i, j; tsk_individual_t ind; double ind_time; const double *node_time = self->tables->nodes.time; const tsk_size_t num_individuals = self->tables->individuals.num_rows; for (i = 0; i < num_individuals; i++) { ret = tsk_treeseq_get_individual(self, (tsk_id_t) i, &ind); tsk_bug_assert(ret == 0); /* the default is UNKNOWN_TIME, but nodes cannot have * UNKNOWN _TIME so this is safe. */ ind_time = TSK_UNKNOWN_TIME; for (j = 0; j < ind.nodes_length; j++) { if (j == 0) { ind_time = node_time[ind.nodes[j]]; } else if (ind_time != node_time[ind.nodes[j]]) { ret = tsk_trace_error(TSK_ERR_INDIVIDUAL_TIME_MISMATCH); goto out; } } output[ind.id] = ind_time; } out: return ret; } /* Stats functions */ #define GET_2D_ROW(array, row_len, row) (array + (((size_t) (row_len)) * (size_t) (row))) static inline double * GET_3D_ROW(double *base, tsk_size_t num_nodes, tsk_size_t output_dim, tsk_size_t window_index, tsk_id_t u) { tsk_size_t offset = window_index * num_nodes * output_dim + ((tsk_size_t) u) * output_dim; return base + offset; } /* Increments the n-dimensional array with the specified shape by the specified value at * the specified coordinate. 
/* Increments the n-dimensional array with the specified shape by the specified value at
 * the specified coordinate.
 *
 * The array is addressed in row-major order: the last coordinate varies
 * fastest. Each coordinate is asserted to be within its dimension.
 */
static inline void
increment_nd_array_value(double *array, tsk_size_t n, const tsk_size_t *shape,
    const tsk_size_t *coordinate, double value)
{
    tsk_size_t offset = 0;
    tsk_size_t product = 1;
    int k;

    /* Walk from the last dimension to the first, accumulating the stride */
    for (k = (int) n - 1; k >= 0; k--) {
        tsk_bug_assert(coordinate[k] < shape[k]);
        offset += coordinate[k] * product;
        product *= shape[k];
    }
    array[offset] += value;
}

/* TODO flatten the reference sets input here and follow the same pattern used
 * in diversity, divergence, etc. */
/* Genealogical nearest neighbours.
 *
 * For every focal node, walks up each tree to the first ancestor that has
 * reference-set nodes below it other than the focal node itself, and
 * accumulates, weighted by the tree's span, the proportion of those nodes
 * belonging to each reference set. The totals are finally divided by the
 * span over which such an ancestor existed.
 *
 * ret_array is written as a row-major (num_focal x num_reference_sets)
 * matrix. `options` is unused.
 *
 * Returns 0 on success, TSK_ERR_BAD_PARAM_VALUE if num_reference_sets is 0
 * or greater than INT16_MAX - 1, TSK_ERR_NODE_OUT_OF_BOUNDS for an invalid
 * focal or reference node, TSK_ERR_DUPLICATE_SAMPLE if a node appears more
 * than once across the reference sets, or TSK_ERR_NO_MEMORY.
 */
int TSK_WARN_UNUSED
tsk_treeseq_genealogical_nearest_neighbours(const tsk_treeseq_t *self,
    const tsk_id_t *focal, tsk_size_t num_focal, const tsk_id_t *const *reference_sets,
    const tsk_size_t *reference_set_size, tsk_size_t num_reference_sets,
    tsk_flags_t TSK_UNUSED(options), double *ret_array)
{
    int ret = 0;
    tsk_id_t u, v, p;
    tsk_size_t j;
    /* TODO It's probably not worth bothering with the int16_t here. */
    int16_t k, focal_reference_set;
    /* We use the K'th element of the array for the total. */
    /* NOTE(review): K is narrowed to int16_t, and the buffers below are sized
     * from it, before num_reference_sets has been range-checked. */
    const int16_t K = (int16_t) (num_reference_sets + 1);
    tsk_size_t num_nodes = self->tables->nodes.num_rows;
    const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows;
    const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order;
    const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order;
    const double *restrict edge_left = self->tables->edges.left;
    const double *restrict edge_right = self->tables->edges.right;
    const tsk_id_t *restrict edge_parent = self->tables->edges.parent;
    const tsk_id_t *restrict edge_child = self->tables->edges.child;
    const double sequence_length = self->tables->sequence_length;
    tsk_id_t tj, tk, h;
    double left, right, *A_row, scale, tree_length;
    tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent));
    /* length[j]: total span over which focal node j contributed */
    double *restrict length = tsk_calloc(num_focal, sizeof(*length));
    /* ref_count: one row of K counters per node; columns 0..K-2 count the
     * reference-set nodes at or below the node, column K-1 is their total */
    uint32_t *restrict ref_count
        = tsk_calloc(((tsk_size_t) K) * num_nodes, sizeof(*ref_count));
    /* reference_set_map[u]: index of the reference set containing u, or -1 */
    int16_t *restrict reference_set_map
        = tsk_malloc(num_nodes * sizeof(*reference_set_map));
    uint32_t *restrict row = NULL;
    uint32_t *restrict child_row = NULL;
    uint32_t total, delta;

    /* The set index is held in an int16_t, so at most INT16_MAX - 1
     * reference sets are supported */
    if (num_reference_sets == 0 || num_reference_sets > (INT16_MAX - 1)) {
        /* TODO: more specific error */
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    if (parent == NULL || ref_count == NULL || reference_set_map == NULL
        || length == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }

    /* 0xff bytes set every entry of both arrays to -1 (TSK_NULL) */
    tsk_memset(parent, 0xff, num_nodes * sizeof(*parent));
    tsk_memset(reference_set_map, 0xff, num_nodes * sizeof(*reference_set_map));
    tsk_memset(ret_array, 0, num_focal * num_reference_sets * sizeof(*ret_array));

    total = 0; /* keep the compiler happy */

    /* Set the initial conditions and check the input. */
    for (k = 0; k < (int16_t) num_reference_sets; k++) {
        for (j = 0; j < reference_set_size[k]; j++) {
            u = reference_sets[k][j];
            if (u < 0 || u >= (tsk_id_t) num_nodes) {
                ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
                goto out;
            }
            if (reference_set_map[u] != TSK_NULL) {
                /* FIXME Technically inaccurate here: the duplicate is a
                 * reference-set node, which need not be a sample */
                ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE);
                goto out;
            }
            reference_set_map[u] = k;
            row = GET_2D_ROW(ref_count, K, u);
            row[k] = 1;
            /* Also set the count for the total among all sets */
            row[K - 1] = 1;
        }
    }
    for (j = 0; j < num_focal; j++) {
        u = focal[j];
        if (u < 0 || u >= (tsk_id_t) num_nodes) {
            ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
            goto out;
        }
    }

    /* Iterate over the trees */
    tj = 0;
    tk = 0;
    left = 0;
    while (tj < num_edges || left < sequence_length) {
        /* Remove edges ending at `left`, subtracting the child's counts
         * from every ancestor */
        while (tk < num_edges && edge_right[O[tk]] == left) {
            h = O[tk];
            tk++;
            u = edge_child[h];
            v = edge_parent[h];
            parent[u] = TSK_NULL;
            child_row = GET_2D_ROW(ref_count, K, u);
            while (v != TSK_NULL) {
                row = GET_2D_ROW(ref_count, K, v);
                for (k = 0; k < K; k++) {
                    row[k] -= child_row[k];
                }
                v = parent[v];
            }
        }
        /* Insert edges starting at `left`, adding the child's counts to
         * every ancestor */
        while (tj < num_edges && edge_left[I[tj]] == left) {
            h = I[tj];
            tj++;
            u = edge_child[h];
            v = edge_parent[h];
            parent[u] = v;
            child_row = GET_2D_ROW(ref_count, K, u);
            while (v != TSK_NULL) {
                row = GET_2D_ROW(ref_count, K, v);
                for (k = 0; k < K; k++) {
                    row[k] += child_row[k];
                }
                v = parent[v];
            }
        }
        right = sequence_length;
        if (tj < num_edges) {
            right = TSK_MIN(right, edge_left[I[tj]]);
        }
        if (tk < num_edges) {
            right = TSK_MIN(right, edge_right[O[tk]]);
        }

        tree_length = right - left;
        /* Process this tree */
        for (j = 0; j < num_focal; j++) {
            u = focal[j];
            focal_reference_set = reference_set_map[u];
            /* delta is 1 when the focal node is itself in a reference set,
             * so that it is not counted as its own neighbour */
            delta = focal_reference_set != -1;
            p = u;
            /* Climb to the first ancestor with other reference nodes below */
            while (p != TSK_NULL) {
                row = GET_2D_ROW(ref_count, K, p);
                total = row[K - 1];
                if (total > delta) {
                    break;
                }
                p = parent[p];
            }
            if (p != TSK_NULL) {
                length[j] += tree_length;
                scale = tree_length / (total - delta);
                A_row = GET_2D_ROW(ret_array, num_reference_sets, j);
                for (k = 0; k < K - 1; k++) {
                    A_row[k] += row[k] * scale;
                }
                if (focal_reference_set != -1) {
                    /* Remove the contribution for the reference set u belongs to and
                     * insert the correct value. The long-hand version is
                     * A_row[k] = A_row[k] - row[k] * scale + (row[k] - 1) * scale;
                     * which cancels to give: */
                    A_row[focal_reference_set] -= scale;
                }
            }
        }

        /* Move on to the next tree */
        left = right;
    }

    /* Divide by the accumulated length for each node to normalise */
    for (j = 0; j < num_focal; j++) {
        A_row = GET_2D_ROW(ret_array, num_reference_sets, j);
        if (length[j] > 0) {
            for (k = 0; k < K - 1; k++) {
                A_row[k] /= length[j];
            }
        }
    }
out:
    /* Can't use tsk_safe_free here because of restrict */
    if (parent != NULL) {
        free(parent);
    }
    if (ref_count != NULL) {
        free(ref_count);
    }
    if (reference_set_map != NULL) {
        free(reference_set_map);
    }
    if (length != NULL) {
        free(length);
    }
    return ret;
}
/* Mean descendants.
 *
 * For every node, computes the span-weighted average number of nodes from
 * each reference set that are at or below it, averaged over the span on
 * which the node had at least one reference node at or below it.
 *
 * ret_array is written as a row-major (num_nodes x num_reference_sets)
 * matrix. `options` is unused.
 *
 * Returns 0 on success, TSK_ERR_BAD_PARAM_VALUE if num_reference_sets is 0
 * or greater than INT32_MAX - 1, TSK_ERR_NODE_OUT_OF_BOUNDS for an invalid
 * reference node, or TSK_ERR_NO_MEMORY.
 */
int TSK_WARN_UNUSED
tsk_treeseq_mean_descendants(const tsk_treeseq_t *self,
    const tsk_id_t *const *reference_sets, const tsk_size_t *reference_set_size,
    tsk_size_t num_reference_sets, tsk_flags_t TSK_UNUSED(options), double *ret_array)
{
    int ret = 0;
    tsk_id_t u, v;
    tsk_size_t j;
    int32_t k;
    /* We use the K'th element of the array for the total. */
    /* NOTE(review): K is narrowed to int32_t, and ref_count is sized from it,
     * before num_reference_sets has been range-checked. */
    const int32_t K = (int32_t) (num_reference_sets + 1);
    tsk_size_t num_nodes = self->tables->nodes.num_rows;
    const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows;
    const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order;
    const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order;
    const double *restrict edge_left = self->tables->edges.left;
    const double *restrict edge_right = self->tables->edges.right;
    const tsk_id_t *restrict edge_parent = self->tables->edges.parent;
    const tsk_id_t *restrict edge_child = self->tables->edges.child;
    const double sequence_length = self->tables->sequence_length;
    tsk_id_t tj, tk, h;
    double left, right, length, *restrict C_row;
    tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent));
    /* ref_count: one row of K counters per node; columns 0..K-2 count the
     * reference-set nodes at or below the node, column K-1 is their total */
    uint32_t *restrict ref_count
        = tsk_calloc(num_nodes * ((size_t) K), sizeof(*ref_count));
    /* last_update[v]: coordinate up to which v's counts have been folded in */
    double *restrict last_update = tsk_calloc(num_nodes, sizeof(*last_update));
    /* total_length[v]: span over which v had a non-zero total count */
    double *restrict total_length = tsk_calloc(num_nodes, sizeof(*total_length));
    uint32_t *restrict row, *restrict child_row;

    if (num_reference_sets == 0 || num_reference_sets > (INT32_MAX - 1)) {
        /* TODO: more specific error */
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    if (parent == NULL || ref_count == NULL || last_update == NULL
        || total_length == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }
    /* TODO add check for duplicate values in the reference sets */

    /* 0xff bytes set every parent entry to -1 (TSK_NULL) */
    tsk_memset(parent, 0xff, num_nodes * sizeof(*parent));
    tsk_memset(ret_array, 0, num_nodes * num_reference_sets * sizeof(*ret_array));

    /* Set the initial conditions and check the input. */
    for (k = 0; k < (int32_t) num_reference_sets; k++) {
        for (j = 0; j < reference_set_size[k]; j++) {
            u = reference_sets[k][j];
            if (u < 0 || u >= (tsk_id_t) num_nodes) {
                ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
                goto out;
            }
            row = GET_2D_ROW(ref_count, K, u);
            row[k] = 1;
            /* Also set the count for the total among all sets */
            row[K - 1] = 1;
        }
    }

    /* Iterate over the trees */
    tj = 0;
    tk = 0;
    left = 0;
    while (tj < num_edges || left < sequence_length) {
        /* Edges ending at `left`: for each ancestor, first bank the span
         * accrued with its current counts, then subtract the child's */
        while (tk < num_edges && edge_right[O[tk]] == left) {
            h = O[tk];
            tk++;
            u = edge_child[h];
            v = edge_parent[h];
            parent[u] = TSK_NULL;
            child_row = GET_2D_ROW(ref_count, K, u);
            while (v != TSK_NULL) {
                row = GET_2D_ROW(ref_count, K, v);
                if (last_update[v] != left) {
                    if (row[K - 1] > 0) {
                        length = left - last_update[v];
                        C_row = GET_2D_ROW(ret_array, num_reference_sets, v);
                        for (k = 0; k < (int32_t) num_reference_sets; k++) {
                            C_row[k] += length * row[k];
                        }
                        total_length[v] += length;
                    }
                    last_update[v] = left;
                }
                for (k = 0; k < K; k++) {
                    row[k] -= child_row[k];
                }
                v = parent[v];
            }
        }
        /* Edges starting at `left`: same banking step, then add the
         * child's counts */
        while (tj < num_edges && edge_left[I[tj]] == left) {
            h = I[tj];
            tj++;
            u = edge_child[h];
            v = edge_parent[h];
            parent[u] = v;
            child_row = GET_2D_ROW(ref_count, K, u);
            while (v != TSK_NULL) {
                row = GET_2D_ROW(ref_count, K, v);
                if (last_update[v] != left) {
                    if (row[K - 1] > 0) {
                        length = left - last_update[v];
                        C_row = GET_2D_ROW(ret_array, num_reference_sets, v);
                        for (k = 0; k < (int32_t) num_reference_sets; k++) {
                            C_row[k] += length * row[k];
                        }
                        total_length[v] += length;
                    }
                    last_update[v] = left;
                }
                for (k = 0; k < K; k++) {
                    row[k] += child_row[k];
                }
                v = parent[v];
            }
        }
        right = sequence_length;
        if (tj < num_edges) {
            right = TSK_MIN(right, edge_left[I[tj]]);
        }
        if (tk < num_edges) {
            right = TSK_MIN(right, edge_right[O[tk]]);
        }
        left = right;
    }

    /* Add the stats for the last tree and divide by the total length that
     * each node was an ancestor to > 0 of the reference nodes. */
    for (v = 0; v < (tsk_id_t) num_nodes; v++) {
        row = GET_2D_ROW(ref_count, K, v);
        C_row = GET_2D_ROW(ret_array, num_reference_sets, v);
        if (row[K - 1] > 0) {
            length = sequence_length - last_update[v];
            total_length[v] += length;
            for (k = 0; k < (int32_t) num_reference_sets; k++) {
                C_row[k] += length * row[k];
            }
        }
        if (total_length[v] > 0) {
            length = total_length[v];
            for (k = 0; k < (int32_t) num_reference_sets; k++) {
                C_row[k] /= length;
            }
        }
    }

out:
    /* Can't use tsk_safe_free here because of restrict */
    if (parent != NULL) {
        free(parent);
    }
    if (ref_count != NULL) {
        free(ref_count);
    }
    if (last_update != NULL) {
        free(last_update);
    }
    if (total_length != NULL) {
        free(total_length);
    }
    return ret;
}
*/ for (v = 0; v < (tsk_id_t) num_nodes; v++) { row = GET_2D_ROW(ref_count, K, v); C_row = GET_2D_ROW(ret_array, num_reference_sets, v); if (row[K - 1] > 0) { length = sequence_length - last_update[v]; total_length[v] += length; for (k = 0; k < (int32_t) num_reference_sets; k++) { C_row[k] += length * row[k]; } } if (total_length[v] > 0) { length = total_length[v]; for (k = 0; k < (int32_t) num_reference_sets; k++) { C_row[k] /= length; } } } out: /* Can't use msp_safe_free here because of restrict */ if (parent != NULL) { free(parent); } if (ref_count != NULL) { free(ref_count); } if (last_update != NULL) { free(last_update); } if (total_length != NULL) { free(total_length); } return ret; } /*********************************** * General stats framework ***********************************/ #define TSK_REQUIRE_FULL_SPAN 1 static int tsk_treeseq_check_windows(const tsk_treeseq_t *self, tsk_size_t num_windows, const double *windows, tsk_flags_t options) { int ret = 0; tsk_size_t j; if (num_windows < 1) { ret = tsk_trace_error(TSK_ERR_BAD_NUM_WINDOWS); goto out; } if (options & TSK_REQUIRE_FULL_SPAN) { /* TODO the general stat code currently requires that we include the * entire tree sequence span. 
This should be relaxed, so hopefully * this branch (and the option) can be removed at some point */ if (windows[0] != 0) { ret = tsk_trace_error(TSK_ERR_BAD_WINDOWS); goto out; } if (windows[num_windows] != self->tables->sequence_length) { ret = tsk_trace_error(TSK_ERR_BAD_WINDOWS); goto out; } } else { if (windows[0] < 0) { ret = tsk_trace_error(TSK_ERR_BAD_WINDOWS); goto out; } if (windows[num_windows] > self->tables->sequence_length) { ret = tsk_trace_error(TSK_ERR_BAD_WINDOWS); goto out; } } for (j = 0; j < num_windows; j++) { if (windows[j] >= windows[j + 1]) { ret = tsk_trace_error(TSK_ERR_BAD_WINDOWS); goto out; } } ret = 0; out: return ret; } static int tsk_treeseq_check_time_windows(tsk_size_t num_windows, const double *windows) { // This does not check the last window ends at infinity, // which is required for some time window functions. int ret = TSK_ERR_BAD_TIME_WINDOWS; tsk_size_t j; if (num_windows < 1) { ret = TSK_ERR_BAD_TIME_WINDOWS_DIM; goto out; } if (windows[0] != 0.0) { goto out; } for (j = 0; j < num_windows; j++) { if (windows[j] >= windows[j + 1]) { goto out; } } ret = 0; out: return ret; } /* TODO make these functions more consistent in how the arguments are ordered */ static inline void update_state(double *X, tsk_size_t state_dim, tsk_id_t dest, tsk_id_t source, int sign) { tsk_size_t k; double *X_dest = GET_2D_ROW(X, state_dim, dest); double *X_source = GET_2D_ROW(X, state_dim, source); for (k = 0; k < state_dim; k++) { X_dest[k] += sign * X_source[k]; } } static inline int update_node_summary(tsk_id_t u, tsk_size_t result_dim, double *node_summary, double *X, tsk_size_t state_dim, general_stat_func_t *f, void *f_params) { double *X_u = GET_2D_ROW(X, state_dim, u); double *summary_u = GET_2D_ROW(node_summary, result_dim, u); return f(state_dim, X_u, result_dim, summary_u, f_params); } static inline void update_running_sum(tsk_id_t u, double sign, const double *restrict branch_length, const double *summary, tsk_size_t result_dim, double 
*running_sum) { const double *summary_u = GET_2D_ROW(summary, result_dim, u); const double x = sign * branch_length[u]; tsk_size_t m; for (m = 0; m < result_dim; m++) { running_sum[m] += x * summary_u[m]; } } static int tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; tsk_id_t u, v; tsk_size_t j, k, window_index; tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; const double *restrict edge_left = self->tables->edges.left; const double *restrict edge_right = self->tables->edges.right; const tsk_id_t *restrict edge_parent = self->tables->edges.parent; const tsk_id_t *restrict edge_child = self->tables->edges.child; const double *restrict time = self->tables->nodes.time; const double sequence_length = self->tables->sequence_length; tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); double *restrict branch_length = tsk_calloc(num_nodes, sizeof(*branch_length)); tsk_id_t tj, tk, h; double t_left, t_right, w_left, w_right, left, right, scale; const double *weight_u; double *state_u, *result_row, *summary_u; double *state = tsk_calloc(num_nodes * state_dim, sizeof(*state)); double *summary = tsk_calloc(num_nodes * result_dim, sizeof(*summary)); double *running_sum = tsk_calloc(result_dim, sizeof(*running_sum)); double *zero_state = tsk_calloc(state_dim, sizeof(*zero_state)); double *zero_summary = tsk_calloc(result_dim, sizeof(*zero_state)); if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) { ret = tsk_trace_error(TSK_ERR_TIME_UNCALIBRATED); goto out; } if (parent == NULL || branch_length == NULL 
|| state == NULL || running_sum == NULL || summary == NULL || zero_state == NULL || zero_summary == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); /* If f is not strict, we may need to set conditions for non-sample nodes as well. */ ret = f(state_dim, zero_state, result_dim, zero_summary, f_params); if (ret != 0) { goto out; } for (j = 0; j < num_nodes; j++) { // we could skip this if zero_summary is zero summary_u = GET_2D_ROW(summary, result_dim, j); tsk_memcpy(summary_u, zero_summary, result_dim * sizeof(*zero_summary)); } /* Set the initial conditions */ for (j = 0; j < self->num_samples; j++) { u = self->samples[j]; state_u = GET_2D_ROW(state, state_dim, u); weight_u = GET_2D_ROW(sample_weights, state_dim, j); tsk_memcpy(state_u, weight_u, state_dim * sizeof(*state_u)); summary_u = GET_2D_ROW(summary, result_dim, u); ret = f(state_dim, state_u, result_dim, summary_u, f_params); if (ret != 0) { goto out; } } tsk_memset(result, 0, num_windows * result_dim * sizeof(*result)); /* Iterate over the trees */ tj = 0; tk = 0; t_left = 0; window_index = 0; while (tj < num_edges || t_left < sequence_length) { while (tk < num_edges && edge_right[O[tk]] == t_left) { h = O[tk]; tk++; u = edge_child[h]; update_running_sum(u, -1, branch_length, summary, result_dim, running_sum); parent[u] = TSK_NULL; branch_length[u] = 0; u = edge_parent[h]; while (u != TSK_NULL) { update_running_sum( u, -1, branch_length, summary, result_dim, running_sum); update_state(state, state_dim, u, edge_child[h], -1); ret = update_node_summary( u, result_dim, summary, state, state_dim, f, f_params); if (ret != 0) { goto out; } update_running_sum( u, +1, branch_length, summary, result_dim, running_sum); u = parent[u]; } } while (tj < num_edges && edge_left[I[tj]] == t_left) { h = I[tj]; tj++; u = edge_child[h]; v = edge_parent[h]; parent[u] = v; branch_length[u] = time[v] - time[u]; update_running_sum(u, +1, branch_length, summary, 
result_dim, running_sum); u = v; while (u != TSK_NULL) { update_running_sum( u, -1, branch_length, summary, result_dim, running_sum); update_state(state, state_dim, u, edge_child[h], +1); ret = update_node_summary( u, result_dim, summary, state, state_dim, f, f_params); if (ret != 0) { goto out; } update_running_sum( u, +1, branch_length, summary, result_dim, running_sum); u = parent[u]; } } t_right = sequence_length; if (tj < num_edges) { t_right = TSK_MIN(t_right, edge_left[I[tj]]); } if (tk < num_edges) { t_right = TSK_MIN(t_right, edge_right[O[tk]]); } while (windows[window_index] < t_right) { tsk_bug_assert(window_index < num_windows); w_left = windows[window_index]; w_right = windows[window_index + 1]; left = TSK_MAX(t_left, w_left); right = TSK_MIN(t_right, w_right); scale = (right - left); tsk_bug_assert(scale > 0); result_row = GET_2D_ROW(result, result_dim, window_index); for (k = 0; k < result_dim; k++) { result_row[k] += running_sum[k] * scale; } if (w_right <= t_right) { window_index++; } else { /* This interval crosses a tree boundary, so we update it again in the */ /* for the next tree */ break; } } /* Move to the next tree */ t_left = t_right; } tsk_bug_assert(window_index == num_windows); out: /* Can't use msp_safe_free here because of restrict */ if (parent != NULL) { free(parent); } if (branch_length != NULL) { free(branch_length); } tsk_safe_free(state); tsk_safe_free(summary); tsk_safe_free(running_sum); tsk_safe_free(zero_state); tsk_safe_free(zero_summary); return ret; } static int get_allele_weights(const tsk_site_t *site, const double *state, tsk_size_t state_dim, const double *total_weight, tsk_size_t *ret_num_alleles, double **ret_allele_states) { int ret = 0; tsk_size_t k; tsk_mutation_t mutation, parent_mut; tsk_size_t mutation_index, allele, num_alleles, alt_allele_length; /* The allele table */ tsk_size_t max_alleles = site->mutations_length + 1; const char **alleles = tsk_malloc(max_alleles * sizeof(*alleles)); tsk_size_t 
*allele_lengths = tsk_calloc(max_alleles, sizeof(*allele_lengths)); double *allele_states = tsk_calloc(max_alleles * state_dim, sizeof(*allele_states)); double *allele_row; const double *state_row; const char *alt_allele; if (alleles == NULL || allele_lengths == NULL || allele_states == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_bug_assert(state != NULL); alleles[0] = site->ancestral_state; allele_lengths[0] = site->ancestral_state_length; tsk_memcpy(allele_states, total_weight, state_dim * sizeof(*allele_states)); num_alleles = 1; for (mutation_index = 0; mutation_index < site->mutations_length; mutation_index++) { mutation = site->mutations[mutation_index]; /* Compute the allele index for this derived state value. */ allele = 0; while (allele < num_alleles) { if (mutation.derived_state_length == allele_lengths[allele] && tsk_memcmp( mutation.derived_state, alleles[allele], allele_lengths[allele]) == 0) { break; } allele++; } if (allele == num_alleles) { tsk_bug_assert(allele < max_alleles); alleles[allele] = mutation.derived_state; allele_lengths[allele] = mutation.derived_state_length; num_alleles++; } /* Add the state for the the mutation's node to this allele */ state_row = GET_2D_ROW(state, state_dim, mutation.node); allele_row = GET_2D_ROW(allele_states, state_dim, allele); for (k = 0; k < state_dim; k++) { allele_row[k] += state_row[k]; } /* Get the index for the alternate allele that we must subtract from */ alt_allele = site->ancestral_state; alt_allele_length = site->ancestral_state_length; if (mutation.parent != TSK_NULL) { parent_mut = site->mutations[mutation.parent - site->mutations[0].id]; alt_allele = parent_mut.derived_state; alt_allele_length = parent_mut.derived_state_length; } allele = 0; while (allele < num_alleles) { if (alt_allele_length == allele_lengths[allele] && tsk_memcmp(alt_allele, alleles[allele], allele_lengths[allele]) == 0) { break; } allele++; } tsk_bug_assert(allele < num_alleles); allele_row = 
GET_2D_ROW(allele_states, state_dim, allele); for (k = 0; k < state_dim; k++) { allele_row[k] -= state_row[k]; } } *ret_num_alleles = num_alleles; *ret_allele_states = allele_states; allele_states = NULL; out: tsk_safe_free(alleles); tsk_safe_free(allele_lengths); tsk_safe_free(allele_states); return ret; } static int compute_general_stat_site_result(tsk_site_t *site, double *state, tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f, void *f_params, double *total_weight, bool polarised, double *result) { int ret = 0; tsk_size_t k; tsk_size_t allele, num_alleles; double *allele_states; double *result_tmp = tsk_calloc(result_dim, sizeof(*result_tmp)); if (result_tmp == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(result, 0, result_dim * sizeof(*result)); ret = get_allele_weights( site, state, state_dim, total_weight, &num_alleles, &allele_states); if (ret != 0) { goto out; } /* Sum over the allele weights. Skip the ancestral state if this is a polarised stat */ for (allele = polarised ? 
1 : 0; allele < num_alleles; allele++) { ret = f(state_dim, GET_2D_ROW(allele_states, state_dim, allele), result_dim, result_tmp, f_params); if (ret != 0) { goto out; } for (k = 0; k < result_dim; k++) { result[k] += result_tmp[k]; } } out: tsk_safe_free(result_tmp); tsk_safe_free(allele_states); return ret; } static int tsk_treeseq_site_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; tsk_id_t u, v; tsk_size_t j, k, tree_site, tree_index, window_index; tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; const double *restrict edge_left = self->tables->edges.left; const double *restrict edge_right = self->tables->edges.right; const tsk_id_t *restrict edge_parent = self->tables->edges.parent; const tsk_id_t *restrict edge_child = self->tables->edges.child; const double sequence_length = self->tables->sequence_length; tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); tsk_site_t *site; tsk_id_t tj, tk, h; double t_left, t_right; const double *weight_u; double *state_u, *result_row; double *state = tsk_calloc(num_nodes * state_dim, sizeof(*state)); double *total_weight = tsk_calloc(state_dim, sizeof(*total_weight)); double *site_result = tsk_calloc(result_dim, sizeof(*site_result)); bool polarised = false; if (parent == NULL || state == NULL || total_weight == NULL || site_result == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); if (options & TSK_STAT_POLARISED) { polarised = true; } /* Set the initial conditions */ for (j = 0; j < self->num_samples; j++) { u = 
self->samples[j];
        state_u = GET_2D_ROW(state, state_dim, u);
        weight_u = GET_2D_ROW(sample_weights, state_dim, j);
        tsk_memcpy(state_u, weight_u, state_dim * sizeof(*state_u));
        /* Accumulate the column totals of the sample weights */
        for (k = 0; k < state_dim; k++) {
            total_weight[k] += weight_u[k];
        }
    }
    tsk_memset(result, 0, num_windows * result_dim * sizeof(*result));

    /* Iterate over the trees */
    tj = 0;
    tk = 0;
    t_left = 0;
    tree_index = 0;
    window_index = 0;
    while (tj < num_edges || t_left < sequence_length) {
        /* Remove the edges that end at t_left, subtracting the child's
         * state from every ancestor */
        while (tk < num_edges && edge_right[O[tk]] == t_left) {
            h = O[tk];
            tk++;
            u = edge_child[h];
            v = edge_parent[h];
            while (v != TSK_NULL) {
                update_state(state, state_dim, v, u, -1);
                v = parent[v];
            }
            parent[u] = TSK_NULL;
        }

        /* Insert the edges that start at t_left, adding the child's
         * state to every ancestor */
        while (tj < num_edges && edge_left[I[tj]] == t_left) {
            h = I[tj];
            tj++;
            u = edge_child[h];
            v = edge_parent[h];
            parent[u] = v;
            while (v != TSK_NULL) {
                update_state(state, state_dim, v, u, +1);
                v = parent[v];
            }
        }
        /* The current tree ends at the next edge insertion or removal */
        t_right = sequence_length;
        if (tj < num_edges) {
            t_right = TSK_MIN(t_right, edge_left[I[tj]]);
        }
        if (tk < num_edges) {
            t_right = TSK_MIN(t_right, edge_right[O[tk]]);
        }

        /* Update the sites */
        for (tree_site = 0; tree_site < self->tree_sites_length[tree_index];
             tree_site++) {
            site = self->tree_sites[tree_index] + tree_site;
            ret = compute_general_stat_site_result(site, state, state_dim, result_dim,
                f, f_params, total_weight, polarised, site_result);
            if (ret != 0) {
                goto out;
            }

            /* Advance to the window containing this site */
            while (windows[window_index + 1] <= site->position) {
                window_index++;
                tsk_bug_assert(window_index < num_windows);
            }
            tsk_bug_assert(windows[window_index] <= site->position);
            tsk_bug_assert(site->position < windows[window_index + 1]);
            result_row = GET_2D_ROW(result, result_dim, window_index);
            for (k = 0; k < result_dim; k++) {
                result_row[k] += site_result[k];
            }
        }
        tree_index++;
        t_left = t_right;
    }
out:
    /* Can't use tsk_safe_free here because of restrict */
    if (parent != NULL) {
        free(parent);
    }
    tsk_safe_free(state);
    tsk_safe_free(total_weight);
    tsk_safe_free(site_result);
    return ret;
}

/* Adds multiplier * source[j] to dest[j] for j in [0, length). */
static inline void
increment_row(tsk_size_t length, double multiplier, double *source, double *dest)
{
    tsk_size_t j;

    for (j = 0; j < length; j++) {
        dest[j] += multiplier * source[j];
    }
}

/* Node-mode general statistic.
 *
 * result is treated as a (num_windows, num_nodes, result_dim) array. Each
 * node's current summary is accumulated into the current window weighted by
 * the span over which it has been unchanged: last_update[v] records the
 * left coordinate from which node_summary[v] has not yet been added to
 * result. Contributions are flushed when a node's state changes and at
 * each window's right edge.
 */
static int
tsk_treeseq_node_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
    const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f,
    void *f_params, tsk_size_t num_windows, const double *windows,
    tsk_flags_t TSK_UNUSED(options), double *result)
{
    int ret = 0;
    tsk_id_t u, v;
    tsk_size_t j, window_index;
    tsk_size_t num_nodes = self->tables->nodes.num_rows;
    const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows;
    const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order;
    const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order;
    const double *restrict edge_left = self->tables->edges.left;
    const double *restrict edge_right = self->tables->edges.right;
    const tsk_id_t *restrict edge_parent = self->tables->edges.parent;
    const tsk_id_t *restrict edge_child = self->tables->edges.child;
    const double sequence_length = self->tables->sequence_length;
    tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent));
    tsk_id_t tj, tk, h;
    const double *weight_u;
    double *state_u;
    double *state = tsk_calloc(num_nodes * state_dim, sizeof(*state));
    double *node_summary = tsk_calloc(num_nodes * result_dim, sizeof(*node_summary));
    double *last_update = tsk_calloc(num_nodes, sizeof(*last_update));
    double t_left, t_right, w_right;

    if (parent == NULL || state == NULL || node_summary == NULL
        || last_update == NULL) {
        ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
        goto out;
    }
    /* 0xff in every byte sets each parent entry to -1 (TSK_NULL) */
    tsk_memset(parent, 0xff, num_nodes * sizeof(*parent));
    tsk_memset(result, 0, num_windows * num_nodes * result_dim * sizeof(*result));

    /* Set the initial conditions */
    for (j = 0; j < self->num_samples; j++) {
        u = self->samples[j];
        state_u = GET_2D_ROW(state, state_dim, u);
        weight_u = GET_2D_ROW(sample_weights, state_dim, j);
        tsk_memcpy(state_u, weight_u, state_dim * sizeof(*state_u));
    }
    /* Every node, sample or not, needs an initial summary */
    for (u = 0; u < (tsk_id_t) num_nodes; u++) {
        ret = update_node_summary(
            u, result_dim, node_summary, state, state_dim, f, f_params);
        if (ret != 0) {
            goto out;
        }
    }

    /* Iterate over the trees */
    tj = 0;
    tk = 0;
    t_left = 0;
    window_index = 0;
    while (tj < num_edges || t_left < sequence_length) {
        tsk_bug_assert(window_index < num_windows);

        while (tk < num_edges && edge_right[O[tk]] == t_left) {
            h = O[tk];
            tk++;
            u = edge_child[h];
            v = edge_parent[h];
            while (v != TSK_NULL) {
                /* Flush v's old summary over [last_update[v], t_left)
                 * before its state changes */
                increment_row(result_dim, t_left - last_update[v],
                    GET_2D_ROW(node_summary, result_dim, v),
                    GET_3D_ROW(result, num_nodes, result_dim, window_index, v));
                last_update[v] = t_left;
                update_state(state, state_dim, v, u, -1);
                ret = update_node_summary(
                    v, result_dim, node_summary, state, state_dim, f, f_params);
                if (ret != 0) {
                    goto out;
                }
                v = parent[v];
            }
            parent[u] = TSK_NULL;
        }

        while (tj < num_edges && edge_left[I[tj]] == t_left) {
            h = I[tj];
            tj++;
            u = edge_child[h];
            v = edge_parent[h];
            parent[u] = v;
            while (v != TSK_NULL) {
                /* Same flush-then-update sequence as for edge removal */
                increment_row(result_dim, t_left - last_update[v],
                    GET_2D_ROW(node_summary, result_dim, v),
                    GET_3D_ROW(result, num_nodes, result_dim, window_index, v));
                last_update[v] = t_left;
                update_state(state, state_dim, v, u, +1);
                ret = update_node_summary(
                    v, result_dim, node_summary, state, state_dim, f, f_params);
                if (ret != 0) {
                    goto out;
                }
                v = parent[v];
            }
        }

        t_right = sequence_length;
        if (tj < num_edges) {
            t_right = TSK_MIN(t_right, edge_left[I[tj]]);
        }
        if (tk < num_edges) {
            t_right = TSK_MIN(t_right, edge_right[O[tk]]);
        }

        /* Close every window whose right edge lies within this tree */
        while (window_index < num_windows && windows[window_index + 1] <= t_right) {
            w_right = windows[window_index + 1];
            /* Flush the contributions of all nodes to the current window */
            for (u = 0; u < (tsk_id_t) num_nodes; u++) {
                tsk_bug_assert(last_update[u] < w_right);
                increment_row(result_dim, w_right - last_update[u],
                    GET_2D_ROW(node_summary, result_dim, u),
                    GET_3D_ROW(result, num_nodes, result_dim, window_index, u));
                last_update[u] = w_right;
            }
            window_index++;
        }

        t_left = t_right;
    }
out:
    /* Can't use tsk_safe_free here because of restrict */
    if (parent != NULL) {
        free(parent);
    }
    tsk_safe_free(state);
    tsk_safe_free(node_summary);
    tsk_safe_free(last_update);
    return ret;
}

/* Divides each of the num_windows rows of array (row_size values per row)
 * by the span of the corresponding window, windows[w + 1] - windows[w]. */
static void
span_normalise(
    tsk_size_t num_windows, const double *windows, tsk_size_t row_size, double *array)
{
    tsk_size_t window_index, k;
    double span, *row;

    for (window_index = 0; window_index < num_windows; window_index++) {
        span = windows[window_index + 1] - windows[window_index];
        row = GET_2D_ROW(array, row_size, window_index);
        for (k = 0; k < row_size; k++) {
            row[k] /= span;
        }
    }
}

/* Parameters for unpolarised_summary_func: the wrapped summary function and
 * its parameters, the total weight per state dimension, and two scratch
 * buffers (state_dim and result_dim doubles respectively). */
typedef struct {
    general_stat_func_t *f;
    void *f_params;
    double *total_weight;
    double *total_minus_state;
    double *result_tmp;
} unpolarised_summary_func_args;

/* Summary function computing f(state) + f(total_weight - state), where f and
 * total_weight come from the unpolarised_summary_func_args passed as params.
 * Returns the first nonzero value returned by f, or 0. */
static int
unpolarised_summary_func(tsk_size_t state_dim, const double *state,
    tsk_size_t result_dim, double *result, void *params)
{
    int ret = 0;
    unpolarised_summary_func_args *upargs = (unpolarised_summary_func_args *) params;
    const double *total_weight = upargs->total_weight;
    double *total_minus_state = upargs->total_minus_state;
    double *result_tmp = upargs->result_tmp;
    tsk_size_t k, m;

    ret = upargs->f(state_dim, state, result_dim, result, upargs->f_params);
    if (ret != 0) {
        goto out;
    }
    /* The complementary weights: everything not counted in state */
    for (k = 0; k < state_dim; k++) {
        total_minus_state[k] = total_weight[k] - state[k];
    }
    ret = upargs->f(
        state_dim, total_minus_state, result_dim, result_tmp, upargs->f_params);
    if (ret != 0) {
        goto out;
    }
    for (m = 0; m < result_dim; m++) {
        result[m] += result_tmp[m];
    }
out:
    return ret;
}

/* Abstracts the running of node and branch stats where the summary function
 * is run twice when non-polarised. We replace the call to the input summary
 * function with a call of the required form when non-polarised, simplifying
 * the implementation and memory management for the node and branch stats.
 */
static int
tsk_polarisable_func_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
    const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f,
    void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options,
    double *result)
{
    int ret = 0;
    bool stat_branch = !!(options & TSK_STAT_BRANCH);
    bool polarised = options & TSK_STAT_POLARISED;
    general_stat_func_t *wrapped_f = f;
    void *wrapped_f_params = f_params;
    const double *weight_u;
    unpolarised_summary_func_args upargs;
    tsk_size_t j, k;

    /* Zeroed so the frees at "out" are safe when polarised */
    tsk_memset(&upargs, 0, sizeof(upargs));
    if (!polarised) {
        upargs.f = f;
        upargs.f_params = f_params;
        upargs.total_weight = tsk_calloc(state_dim, sizeof(double));
        upargs.total_minus_state = tsk_calloc(state_dim, sizeof(double));
        upargs.result_tmp = tsk_calloc(result_dim, sizeof(double));

        if (upargs.total_weight == NULL || upargs.total_minus_state == NULL
            || upargs.result_tmp == NULL) {
            ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
            goto out;
        }

        /* Compute the total weight */
        for (j = 0; j < self->num_samples; j++) {
            weight_u = GET_2D_ROW(sample_weights, state_dim, j);
            for (k = 0; k < state_dim; k++) {
                upargs.total_weight[k] += weight_u[k];
            }
        }

        wrapped_f = unpolarised_summary_func;
        wrapped_f_params = &upargs;
    }

    /* Anything that is not branch mode is run as node mode */
    if (stat_branch) {
        ret = tsk_treeseq_branch_general_stat(self, state_dim, sample_weights,
            result_dim, wrapped_f, wrapped_f_params, num_windows, windows, options,
            result);
    } else {
        ret = tsk_treeseq_node_general_stat(self, state_dim, sample_weights,
            result_dim, wrapped_f, wrapped_f_params, num_windows, windows, options,
            result);
    }
out:
    tsk_safe_free(upargs.total_weight);
    tsk_safe_free(upargs.total_minus_state);
    tsk_safe_free(upargs.result_tmp);
    return ret;
}

int
tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim,
    const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f,
    void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options,
    double *result)
{
    int ret = 0;
    bool stat_site =
!!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); bool stat_node = !!(options & TSK_STAT_NODE); double default_windows[] = { 0, self->tables->sequence_length }; tsk_size_t row_size; /* If no mode is specified, we default to site mode */ if (!(stat_site || stat_branch || stat_node)) { stat_site = true; } /* It's an error to specify more than one mode */ if (stat_site + stat_branch + stat_node > 1) { ret = tsk_trace_error(TSK_ERR_MULTIPLE_STAT_MODES); goto out; } if (state_dim < 1) { ret = tsk_trace_error(TSK_ERR_BAD_STATE_DIMS); goto out; } if (result_dim < 1) { ret = tsk_trace_error(TSK_ERR_BAD_RESULT_DIMS); goto out; } if (windows == NULL) { num_windows = 1; windows = default_windows; } else { ret = tsk_treeseq_check_windows( self, num_windows, windows, TSK_REQUIRE_FULL_SPAN); if (ret != 0) { goto out; } } if (stat_site) { ret = tsk_treeseq_site_general_stat(self, state_dim, sample_weights, result_dim, f, f_params, num_windows, windows, options, result); } else { ret = tsk_polarisable_func_general_stat(self, state_dim, sample_weights, result_dim, f, f_params, num_windows, windows, options, result); } if (options & TSK_STAT_SPAN_NORMALISE) { row_size = result_dim; if (stat_node) { row_size = result_dim * tsk_treeseq_get_num_nodes(self); } span_normalise(num_windows, windows, row_size, result); } out: return ret; } static int check_set_indexes( tsk_size_t num_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes) { int ret = 0; tsk_size_t j; for (j = 0; j < num_set_indexes; j++) { if (set_indexes[j] < 0 || set_indexes[j] >= (tsk_id_t) num_sets) { ret = tsk_trace_error(TSK_ERR_BAD_SAMPLE_SET_INDEX); goto out; } } out: return ret; } static int tsk_treeseq_check_sample_sets(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets) { int ret = 0; tsk_size_t j, k, l; const tsk_id_t num_nodes = (tsk_id_t) self->tables->nodes.num_rows; tsk_id_t u, sample_index; if 
(num_sample_sets == 0) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_SAMPLE_SETS); goto out; } j = 0; for (k = 0; k < num_sample_sets; k++) { if (sample_set_sizes[k] == 0) { ret = tsk_trace_error(TSK_ERR_EMPTY_SAMPLE_SET); goto out; } for (l = 0; l < sample_set_sizes[k]; l++) { u = sample_sets[j]; if (u < 0 || u >= num_nodes) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } sample_index = self->sample_index_map[u]; if (sample_index == TSK_NULL) { ret = tsk_trace_error(TSK_ERR_BAD_SAMPLES); goto out; } j++; } } out: return ret; } typedef struct { tsk_size_t num_samples; } weight_stat_params_t; typedef struct { tsk_size_t num_samples; tsk_size_t num_covariates; double *V; } covariates_stat_params_t; typedef struct { const tsk_id_t *sample_sets; tsk_size_t num_sample_sets; const tsk_size_t *sample_set_sizes; const tsk_id_t *set_indexes; } sample_count_stat_params_t; typedef struct { tsk_size_t num_samples; double *total_weights; const tsk_id_t *index_tuples; } indexed_weight_stat_params_t; static int tsk_treeseq_sample_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; const tsk_size_t num_samples = self->num_samples; tsk_size_t j, k, l; tsk_id_t u, sample_index; double *weights = NULL; double *weight_row; sample_count_stat_params_t args = { .sample_sets = sample_sets, .num_sample_sets = num_sample_sets, .sample_set_sizes = sample_set_sizes, .set_indexes = set_indexes }; ret = tsk_treeseq_check_sample_sets( self, num_sample_sets, sample_set_sizes, sample_sets); if (ret != 0) { goto out; } weights = tsk_calloc(num_samples * num_sample_sets, sizeof(*weights)); if (weights == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } j = 0; for (k = 0; k < num_sample_sets; k++) { for (l = 0; l < 
sample_set_sizes[k]; l++) { u = sample_sets[j]; sample_index = self->sample_index_map[u]; weight_row = GET_2D_ROW(weights, num_sample_sets, sample_index); if (weight_row[k] != 0) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } weight_row[k] = 1; j++; } } ret = tsk_treeseq_general_stat(self, num_sample_sets, weights, result_dim, f, &args, num_windows, windows, options, result); out: tsk_safe_free(weights); return ret; } /*********************************** * Two Locus Statistics ***********************************/ static int get_allele_samples(const tsk_site_t *site, tsk_size_t site_offset, const tsk_bitset_t *state, tsk_bitset_t *out_allele_samples, tsk_size_t *out_num_alleles) { int ret = 0; tsk_mutation_t mutation, parent_mut; tsk_size_t mutation_index, allele, alt_allele, alt_allele_length; /* The allele table */ tsk_size_t max_alleles = site->mutations_length + 1; const char **alleles = tsk_malloc(max_alleles * sizeof(*alleles)); tsk_size_t *allele_lengths = tsk_calloc(max_alleles, sizeof(*allele_lengths)); const char *alt_allele_state; tsk_size_t num_alleles = 1; if (alleles == NULL || allele_lengths == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_bug_assert(state != NULL); alleles[0] = site->ancestral_state; allele_lengths[0] = site->ancestral_state_length; for (mutation_index = 0; mutation_index < site->mutations_length; mutation_index++) { mutation = site->mutations[mutation_index]; /* Compute the allele index for this derived state value. 
*/ for (allele = 0; allele < num_alleles; allele++) { if (mutation.derived_state_length == allele_lengths[allele] && tsk_memcmp( mutation.derived_state, alleles[allele], allele_lengths[allele]) == 0) { break; } } if (allele == num_alleles) { tsk_bug_assert(allele < max_alleles); alleles[allele] = mutation.derived_state; allele_lengths[allele] = mutation.derived_state_length; num_alleles++; } /* Add the mutation's samples to this allele */ tsk_bitset_union( out_allele_samples, allele + site_offset, state, mutation_index); /* Get the index for the alternate allele that we must subtract from */ alt_allele_state = site->ancestral_state; alt_allele_length = site->ancestral_state_length; if (mutation.parent != TSK_NULL) { parent_mut = site->mutations[mutation.parent - site->mutations[0].id]; alt_allele_state = parent_mut.derived_state; alt_allele_length = parent_mut.derived_state_length; } for (alt_allele = 0; alt_allele < num_alleles; alt_allele++) { if (alt_allele_length == allele_lengths[alt_allele] && tsk_memcmp( alt_allele_state, alleles[alt_allele], allele_lengths[alt_allele]) == 0) { break; } } tsk_bug_assert(allele < num_alleles); tsk_bitset_subtract(out_allele_samples, alt_allele + site_offset, out_allele_samples, allele + site_offset); } *out_num_alleles = num_alleles; out: tsk_safe_free(alleles); tsk_safe_free(allele_lengths); return ret; } static int norm_hap_weighted(tsk_size_t result_dim, const double *hap_weights, tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *weight_row; double n; tsk_size_t k; for (k = 0; k < result_dim; k++) { weight_row = GET_2D_ROW(hap_weights, 3, k); n = (double) args.sample_set_sizes[k]; result[k] = weight_row[0] / n; } return 0; } static int norm_hap_weighted_ij(tsk_size_t result_dim, const double *hap_weights, tsk_size_t TSK_UNUSED(n_a), tsk_size_t TSK_UNUSED(n_b), double *result, void *params) { 
sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *weight_row; double ni, nj, wAB_i, wAB_j; tsk_id_t i, j; tsk_size_t k; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; weight_row = GET_2D_ROW(hap_weights, 3, i); wAB_i = weight_row[0]; weight_row = GET_2D_ROW(hap_weights, 3, j); wAB_j = weight_row[0]; result[k] = (wAB_i + wAB_j) / (ni + nj); } return 0; } static int norm_total_weighted(tsk_size_t result_dim, const double *TSK_UNUSED(hap_weights), tsk_size_t n_a, tsk_size_t n_b, double *result, void *TSK_UNUSED(params)) { tsk_size_t k; double norm = 1 / (double) (n_a * n_b); for (k = 0; k < result_dim; k++) { result[k] = norm; } return 0; } static void get_all_samples_bits(tsk_bitset_t *all_samples, tsk_size_t n) { tsk_size_t i; const tsk_bitset_val_t all = ~((tsk_bitset_val_t) 0); const tsk_bitset_val_t remainder_samples = n % TSK_BITSET_BITS; all_samples->data[all_samples->row_len - 1] = remainder_samples ? ~(all << remainder_samples) : all; for (i = 0; i < all_samples->row_len - 1; i++) { all_samples->data[i] = all; } } // Stores the intermediate values for computing two-locus statistics. 
typedef struct { double *weights; double *norm; double *result_tmp; tsk_bitset_t AB_samples; } two_locus_work_t; static int two_locus_work_init(tsk_size_t max_alleles, tsk_size_t num_samples, tsk_size_t result_dim, tsk_size_t state_dim, two_locus_work_t *out) { int ret = 0; out->weights = tsk_malloc(3 * state_dim * sizeof(*out->weights)); out->norm = tsk_malloc(result_dim * sizeof(*out->norm)); out->result_tmp = tsk_malloc(result_dim * max_alleles * max_alleles * sizeof(*out->result_tmp)); tsk_memset(&out->AB_samples, 0, sizeof(out->AB_samples)); if (out->weights == NULL || out->norm == NULL || out->result_tmp == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_bitset_init(&out->AB_samples, num_samples, 1); if (ret != 0) { goto out; } out: return ret; } static void two_locus_work_free(two_locus_work_t *work) { tsk_safe_free(work->weights); tsk_safe_free(work->norm); tsk_safe_free(work->result_tmp); tsk_bitset_free(&work->AB_samples); } static int compute_general_normed_two_site_stat_result(const tsk_bitset_t *state, const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off, tsk_size_t num_a_alleles, tsk_size_t num_b_alleles, tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, norm_func_t *norm_f, bool polarised, two_locus_work_t *restrict work, double *result) { int ret = 0; // Sample sets and b sites are rows, a sites are columns // b1 b2 b3 // a1 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] // a2 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] // a3 [s1, s2, s3] [s1, s2, s3] [s1, s2, s3] tsk_size_t k, mut_a, mut_b, result_row_len = num_b_alleles * result_dim; uint8_t is_polarised = polarised ? 
1 : 0; double *restrict hap_row, *restrict result_tmp_row; double *restrict norm = work->norm; double *restrict weights = work->weights; double *restrict result_tmp = work->result_tmp; tsk_bitset_t AB_samples = work->AB_samples; for (mut_a = is_polarised; mut_a < num_a_alleles; mut_a++) { result_tmp_row = GET_2D_ROW(result_tmp, result_row_len, mut_a); for (mut_b = is_polarised; mut_b < num_b_alleles; mut_b++) { for (k = 0; k < state_dim; k++) { tsk_bitset_intersect(state, a_off + (mut_a * state_dim) + k, state, b_off + (mut_b * state_dim) + k, &AB_samples); hap_row = GET_2D_ROW(weights, 3, k); hap_row[0] = (double) tsk_bitset_count(&AB_samples, 0); hap_row[1] = (double) allele_counts[a_off + (mut_a * state_dim) + k] - hap_row[0]; hap_row[2] = (double) allele_counts[b_off + (mut_b * state_dim) + k] - hap_row[0]; } ret = f(state_dim, weights, result_dim, result_tmp_row, f_params); if (ret != 0) { goto out; } ret = norm_f(result_dim, weights, num_a_alleles - is_polarised, num_b_alleles - is_polarised, norm, f_params); if (ret != 0) { goto out; } for (k = 0; k < result_dim; k++) { result[k] += result_tmp_row[k] * norm[k]; } result_tmp_row += result_dim; // Advance to the next column } } out: return ret; } static int compute_general_two_site_stat_result(const tsk_bitset_t *state, const tsk_size_t *allele_counts, tsk_size_t a_off, tsk_size_t b_off, tsk_size_t state_dim, tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, two_locus_work_t *restrict work, double *result) { int ret = 0; tsk_size_t k; tsk_bitset_t AB_samples = work->AB_samples; tsk_size_t mut_a = 1, mut_b = 1; double *restrict hap_row, *restrict weights = work->weights; for (k = 0; k < state_dim; k++) { tsk_bitset_intersect(state, a_off + (mut_a * state_dim) + k, state, b_off + (mut_b * state_dim) + k, &AB_samples); hap_row = GET_2D_ROW(weights, 3, k); hap_row[0] = (double) tsk_bitset_count(&AB_samples, 0); hap_row[1] = (double) allele_counts[a_off + (mut_a * state_dim) + k] 
- hap_row[0]; hap_row[2] = (double) allele_counts[b_off + (mut_b * state_dim) + k] - hap_row[0]; } ret = f(state_dim, weights, result_dim, result, f_params); if (ret != 0) { goto out; } out: return ret; } static void get_site_row_col_indices(tsk_size_t n_rows, const tsk_id_t *row_sites, tsk_size_t n_cols, const tsk_id_t *col_sites, tsk_id_t *sites, tsk_size_t *n_sites, tsk_size_t *row_idx, tsk_size_t *col_idx) { tsk_size_t r = 0, c = 0, s = 0; // Iterate rows and columns until we've exhaused one of the lists while ((r < n_rows) && (c < n_cols)) { if (row_sites[r] < col_sites[c]) { sites[s] = row_sites[r]; row_idx[r] = s; s++; r++; } else if (col_sites[c] < row_sites[r]) { sites[s] = col_sites[c]; col_idx[c] = s; s++; c++; } else { // row == col sites[s] = row_sites[r]; col_idx[c] = s; row_idx[r] = s; s++; r++; c++; } } // If there are any items remaining in the other list, drain it while (r < n_rows) { sites[s] = row_sites[r]; row_idx[r] = s; s++; r++; } while (c < n_cols) { sites[s] = col_sites[c]; col_idx[c] = s; s++; c++; } *n_sites = s; } static int get_mutation_samples(const tsk_treeseq_t *ts, const tsk_id_t *sites, tsk_size_t n_sites, tsk_size_t *num_alleles, tsk_bitset_t *allele_samples) { int ret = 0; const tsk_flags_t *restrict flags = ts->tables->nodes.flags; const tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); const tsk_size_t *restrict site_muts_len = ts->site_mutations_length; tsk_site_t site; tsk_tree_t tree; tsk_bitset_t all_samples_bits, mut_samples; tsk_size_t max_muts_len, site_offset, num_nodes, site_idx, s, m, n; tsk_id_t node, *nodes = NULL; void *tmp_nodes; tsk_memset(&mut_samples, 0, sizeof(mut_samples)); tsk_memset(&all_samples_bits, 0, sizeof(all_samples_bits)); max_muts_len = 0; for (s = 0; s < n_sites; s++) { max_muts_len = TSK_MAX(site_muts_len[sites[s]], max_muts_len); } // Allocate a bit array of size max alleles for all sites ret = tsk_bitset_init(&mut_samples, num_samples, max_muts_len); if (ret != 0) { goto out; } ret = 
tsk_bitset_init(&all_samples_bits, num_samples, 1); if (ret != 0) { goto out; } get_all_samples_bits(&all_samples_bits, num_samples); ret = tsk_tree_init(&tree, ts, TSK_NO_SAMPLE_COUNTS); if (ret != 0) { goto out; } // For each mutation within each site, perform one preorder traversal to gather // the samples under each mutation's node. site_offset = 0; for (site_idx = 0; site_idx < n_sites; site_idx++) { tsk_treeseq_get_site(ts, sites[site_idx], &site); ret = tsk_tree_seek(&tree, site.position, 0); if (ret != 0) { goto out; } tmp_nodes = tsk_realloc(nodes, tsk_tree_get_size_bound(&tree) * sizeof(*nodes)); if (tmp_nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } nodes = tmp_nodes; tsk_bitset_union(allele_samples, site_offset, &all_samples_bits, 0); // Zero out results before the start of each iteration tsk_memset(mut_samples.data, 0, mut_samples.row_len * max_muts_len * sizeof(tsk_bitset_val_t)); for (m = 0; m < site.mutations_length; m++) { node = site.mutations[m].node; ret = tsk_tree_preorder_from(&tree, node, nodes, &num_nodes); if (ret != 0) { goto out; } for (n = 0; n < num_nodes; n++) { node = nodes[n]; if (flags[node] & TSK_NODE_IS_SAMPLE) { tsk_bitset_set_bit( &mut_samples, m, (tsk_bitset_val_t) ts->sample_index_map[node]); } } } get_allele_samples( &site, site_offset, &mut_samples, allele_samples, &(num_alleles[site_idx])); site_offset += site.mutations_length + 1; } // if adding code below, check ret before continuing out: tsk_safe_free(nodes); tsk_tree_free(&tree); tsk_bitset_free(&mut_samples); tsk_bitset_free(&all_samples_bits); return ret == TSK_TREE_OK ? 0 : ret; } // Given the samples under each allele's node and the sample sets, get the samples under // each allele's node for each sample set. We pack this data into a bitset // (`allele_sample_sets`) that is size m x n, where m is (n_alleles * num_sample_sets) // and n is the size of the largest sample set. 
In addition, we compute the number of // samples contained in the intersection of each allele's samples and each sample set in // an array (`allele_sample_sets`) of length (n_alleles * num_sample_sets). static void get_mutation_sample_sets(const tsk_bitset_t *allele_samples, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, const tsk_id_t *sample_index_map, tsk_bitset_t *allele_sample_sets, tsk_size_t *allele_sample_set_counts) { tsk_bitset_val_t k, sample; tsk_size_t i, j, ss_off; for (i = 0; i < allele_samples->len; i++) { ss_off = 0; for (j = 0; j < num_sample_sets; j++) { for (k = 0; k < sample_set_sizes[j]; k++) { sample = (tsk_bitset_val_t) sample_index_map[sample_sets[k + ss_off]]; if (tsk_bitset_contains(allele_samples, i, sample)) { tsk_bitset_set_bit(allele_sample_sets, j + i * num_sample_sets, k); allele_sample_set_counts[j + i * num_sample_sets]++; } } ss_off += sample_set_sizes[j]; } } } static int tsk_treeseq_two_site_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, norm_func_t *norm_f, tsk_size_t n_rows, const tsk_id_t *row_sites, tsk_size_t n_cols, const tsk_id_t *col_sites, tsk_flags_t options, double *result) { int ret = 0; tsk_bitset_t allele_samples, allele_sample_sets; bool polarised = options & TSK_STAT_POLARISED; tsk_id_t *sites; tsk_size_t i, j, n_sites, *row_idx, *col_idx; double *result_row; const tsk_size_t num_samples = self->num_samples; tsk_size_t *num_alleles = NULL, *site_offsets = NULL, *allele_counts = NULL; tsk_size_t result_row_len = n_cols * result_dim; tsk_size_t max_ss_size = 0, max_alleles = 0, n_alleles = 0; two_locus_work_t work; tsk_memset(&work, 0, sizeof(work)); tsk_memset(&allele_samples, 0, sizeof(allele_samples)); tsk_memset(&allele_sample_sets, 0, sizeof(allele_sample_sets)); sites = 
tsk_malloc(self->tables->sites.num_rows * sizeof(*sites)); row_idx = tsk_malloc(self->tables->sites.num_rows * sizeof(*row_idx)); col_idx = tsk_malloc(self->tables->sites.num_rows * sizeof(*col_idx)); if (sites == NULL || row_idx == NULL || col_idx == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } get_site_row_col_indices( n_rows, row_sites, n_cols, col_sites, sites, &n_sites, row_idx, col_idx); // depends on n_sites num_alleles = tsk_malloc(n_sites * sizeof(*num_alleles)); site_offsets = tsk_malloc(n_sites * sizeof(*site_offsets)); if (num_alleles == NULL || site_offsets == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (i = 0; i < n_sites; i++) { site_offsets[i] = n_alleles * num_sample_sets; n_alleles += self->site_mutations_length[sites[i]] + 1; max_alleles = TSK_MAX(self->site_mutations_length[sites[i]], max_alleles); } max_alleles++; // add 1 for the ancestral allele // depends on n_alleles ret = tsk_bitset_init(&allele_samples, num_samples, n_alleles); if (ret != 0) { goto out; } for (i = 0; i < num_sample_sets; i++) { max_ss_size = TSK_MAX(sample_set_sizes[i], max_ss_size); } // depend on n_alleles and max_ss_size ret = tsk_bitset_init(&allele_sample_sets, max_ss_size, n_alleles * num_sample_sets); if (ret != 0) { goto out; } allele_counts = tsk_calloc(n_alleles * num_sample_sets, sizeof(*allele_counts)); if (allele_counts == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } // depends on max_ss_size and max_alleles ret = two_locus_work_init(max_alleles, max_ss_size, result_dim, state_dim, &work); if (ret != 0) { goto out; } // we track the number of alleles to account for backmutations ret = get_mutation_samples(self, sites, n_sites, num_alleles, &allele_samples); if (ret != 0) { goto out; } get_mutation_sample_sets(&allele_samples, num_sample_sets, sample_set_sizes, sample_sets, self->sample_index_map, &allele_sample_sets, allele_counts); // For each row/column pair, fill in the sample set in the result 
matrix. for (i = 0; i < n_rows; i++) { result_row = GET_2D_ROW(result, result_row_len, i); for (j = 0; j < n_cols; j++) { if (num_alleles[row_idx[i]] == 2 && num_alleles[col_idx[j]] == 2) { // both sites are biallelic ret = compute_general_two_site_stat_result(&allele_sample_sets, allele_counts, site_offsets[row_idx[i]], site_offsets[col_idx[j]], state_dim, result_dim, f, f_params, &work, &(result_row[j * result_dim])); } else { // at least one site is multiallelic ret = compute_general_normed_two_site_stat_result(&allele_sample_sets, allele_counts, site_offsets[row_idx[i]], site_offsets[col_idx[j]], num_alleles[row_idx[i]], num_alleles[col_idx[j]], state_dim, result_dim, f, f_params, norm_f, polarised, &work, &(result_row[j * result_dim])); } if (ret != 0) { goto out; } } } out: tsk_safe_free(sites); tsk_safe_free(row_idx); tsk_safe_free(col_idx); tsk_safe_free(num_alleles); tsk_safe_free(site_offsets); tsk_safe_free(allele_counts); two_locus_work_free(&work); tsk_bitset_free(&allele_samples); tsk_bitset_free(&allele_sample_sets); return ret; } static int sample_sets_to_bitset(const tsk_treeseq_t *self, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_sample_sets, tsk_bitset_t *sample_sets_bits) { int ret; tsk_size_t j, k, l; tsk_id_t u, sample_index; ret = tsk_bitset_init(sample_sets_bits, self->num_samples, num_sample_sets); if (ret != 0) { return ret; } j = 0; for (k = 0; k < num_sample_sets; k++) { for (l = 0; l < sample_set_sizes[k]; l++) { u = sample_sets[j]; sample_index = self->sample_index_map[u]; if (tsk_bitset_contains( sample_sets_bits, k, (tsk_bitset_val_t) sample_index)) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } tsk_bitset_set_bit(sample_sets_bits, k, (tsk_bitset_val_t) sample_index); j++; } } out: return ret; } static int check_sites(const tsk_id_t *sites, tsk_size_t num_sites, tsk_size_t num_site_rows) { int ret = 0; tsk_size_t i; if (num_sites == 0) { return ret; // No need to verify sites if 
there aren't any } for (i = 0; i < num_sites - 1; i++) { if (sites[i] < 0 || sites[i] >= (tsk_id_t) num_site_rows) { ret = tsk_trace_error(TSK_ERR_SITE_OUT_OF_BOUNDS); goto out; } if (sites[i] > sites[i + 1]) { ret = tsk_trace_error(TSK_ERR_STAT_UNSORTED_SITES); goto out; } if (sites[i] == sites[i + 1]) { ret = tsk_trace_error(TSK_ERR_STAT_DUPLICATE_SITES); goto out; } } // check the last value if (sites[i] < 0 || sites[i] >= (tsk_id_t) num_site_rows) { ret = tsk_trace_error(TSK_ERR_SITE_OUT_OF_BOUNDS); goto out; } out: return ret; } static int check_positions( const double *positions, tsk_size_t num_positions, double sequence_length) { int ret = 0; tsk_size_t i; if (num_positions == 0) { return ret; // No need to verify positions if there aren't any } for (i = 0; i < num_positions - 1; i++) { if (positions[i] < 0 || positions[i] >= sequence_length) { ret = tsk_trace_error(TSK_ERR_POSITION_OUT_OF_BOUNDS); goto out; } if (positions[i] > positions[i + 1]) { ret = tsk_trace_error(TSK_ERR_STAT_UNSORTED_POSITIONS); goto out; } if (positions[i] == positions[i + 1]) { ret = tsk_trace_error(TSK_ERR_STAT_DUPLICATE_POSITIONS); goto out; } } // check bounds of last value if (positions[i] < 0 || positions[i] >= sequence_length) { ret = tsk_trace_error(TSK_ERR_POSITION_OUT_OF_BOUNDS); goto out; } out: return ret; } static int positions_to_tree_indexes(const tsk_treeseq_t *ts, const double *positions, tsk_size_t num_positions, tsk_id_t **tree_indexes) { int ret = 0; tsk_id_t tree_index = 0; tsk_size_t i, num_trees = ts->num_trees; // This is tricky. 
If there are 0 positions, we calloc a size of 1 // we must calloc, because memset will have no effect when called with size 0 *tree_indexes = tsk_calloc(num_positions, sizeof(*tree_indexes)); if (tree_indexes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(*tree_indexes, TSK_NULL, num_positions * sizeof(**tree_indexes)); for (i = 0; i < num_positions; i++) { while (ts->breakpoints[tree_index + 1] <= positions[i]) { tree_index++; } (*tree_indexes)[i] = tree_index; } tsk_bug_assert(tree_index <= (tsk_id_t) (num_trees - 1)); out: return ret; } static int get_index_counts( const tsk_id_t *indexes, tsk_size_t num_indexes, tsk_size_t **out_counts) { int ret = 0; tsk_id_t index = indexes[0]; tsk_size_t count, i; tsk_size_t *counts = tsk_calloc( (tsk_size_t) (indexes[num_indexes ? num_indexes - 1 : 0] - indexes[0] + 1), sizeof(*counts)); if (counts == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } count = 1; for (i = 1; i < num_indexes; i++) { if (indexes[i] == indexes[i - 1]) { count++; } else { counts[index - indexes[0]] = count; count = 1; index = indexes[i]; } } counts[index - indexes[0]] = count; *out_counts = counts; out: return ret; } typedef struct { tsk_tree_t tree; tsk_bitset_t *node_samples; tsk_id_t *parent; tsk_id_t *edges_out; tsk_id_t *edges_in; double *branch_len; tsk_size_t n_edges_out; tsk_size_t n_edges_in; } iter_state; static int iter_state_init(iter_state *self, const tsk_treeseq_t *ts, tsk_size_t state_dim) { int ret = 0; const tsk_size_t num_nodes = ts->tables->nodes.num_rows; ret = tsk_tree_init(&self->tree, ts, TSK_NO_SAMPLE_COUNTS); if (ret != 0) { goto out; } self->node_samples = tsk_calloc(1, sizeof(*self->node_samples)); if (self->node_samples == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_bitset_init(self->node_samples, ts->num_samples, state_dim * num_nodes); if (ret != 0) { goto out; } self->parent = tsk_malloc(num_nodes * sizeof(*self->parent)); self->edges_out = 
tsk_malloc(num_nodes * sizeof(*self->edges_out)); self->edges_in = tsk_malloc(num_nodes * sizeof(*self->edges_in)); self->branch_len = tsk_calloc(num_nodes, sizeof(*self->branch_len)); if (self->parent == NULL || self->edges_out == NULL || self->edges_in == NULL || self->branch_len == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } out: return ret; } static int get_node_samples(const tsk_treeseq_t *ts, tsk_size_t state_dim, const tsk_bitset_t *sample_sets, tsk_bitset_t *node_samples) { int ret = 0; tsk_size_t n, k; tsk_size_t num_nodes = ts->tables->nodes.num_rows; tsk_bitset_val_t sample; const tsk_id_t *restrict sample_index_map = ts->sample_index_map; const tsk_flags_t *restrict flags = ts->tables->nodes.flags; ret = tsk_bitset_init(node_samples, ts->num_samples, num_nodes * state_dim); if (ret != 0) { goto out; } for (k = 0; k < state_dim; k++) { for (n = 0; n < num_nodes; n++) { if (flags[n] & TSK_NODE_IS_SAMPLE) { sample = (tsk_bitset_val_t) sample_index_map[n]; if (tsk_bitset_contains(sample_sets, k, sample)) { tsk_bitset_set_bit(node_samples, (state_dim * n) + k, sample); } } } } out: return ret; } static void iter_state_clear(iter_state *self, tsk_size_t state_dim, tsk_size_t num_nodes, const tsk_bitset_t *node_samples) { self->n_edges_out = 0; self->n_edges_in = 0; tsk_tree_clear(&self->tree); tsk_memset(self->parent, TSK_NULL, num_nodes * sizeof(*self->parent)); tsk_memset(self->edges_out, TSK_NULL, num_nodes * sizeof(*self->edges_out)); tsk_memset(self->edges_in, TSK_NULL, num_nodes * sizeof(*self->edges_in)); tsk_memset(self->branch_len, 0, num_nodes * sizeof(*self->branch_len)); tsk_memcpy(self->node_samples->data, node_samples->data, node_samples->row_len * state_dim * num_nodes * sizeof(*node_samples->data)); } static void iter_state_free(iter_state *self) { tsk_tree_free(&self->tree); tsk_bitset_free(self->node_samples); tsk_safe_free(self->node_samples); tsk_safe_free(self->parent); tsk_safe_free(self->edges_out); 
tsk_safe_free(self->edges_in); tsk_safe_free(self->branch_len); } static int advance_collect_edges(iter_state *s, tsk_id_t index) { int ret = 0; tsk_id_t j, e; tsk_size_t i; double left, right; tsk_tree_position_t pos; tsk_tree_t *tree = &s->tree; const double *restrict edge_left = tree->tree_sequence->tables->edges.left; const double *restrict edge_right = tree->tree_sequence->tables->edges.right; // Either we're seeking forward one step from some nonzero position in the tree, or // from the beginning of the tree sequence. if (tree->index != TSK_NULL || index == 0) { ret = tsk_tree_next(tree); if (ret < 0) { goto out; } pos = tree->tree_pos; i = 0; for (j = pos.out.start; j != pos.out.stop; j++) { s->edges_out[i] = pos.out.order[j]; i++; } s->n_edges_out = i; i = 0; for (j = pos.in.start; j != pos.in.stop; j++) { s->edges_in[i] = pos.in.order[j]; i++; } s->n_edges_in = i; } else { // Seek from an arbitrary nonzero position from an uninitialized tree. tsk_bug_assert(tree->index == -1); ret = tsk_tree_seek_index(tree, index, 0); if (ret < 0) { goto out; } pos = tree->tree_pos; i = 0; if (pos.direction == TSK_DIR_FORWARD) { left = pos.interval.left; for (j = pos.in.start; j != pos.in.stop; j++) { e = pos.in.order[j]; if (edge_left[e] <= left && left < edge_right[e]) { s->edges_in[i] = pos.in.order[j]; i++; } } } else { right = pos.interval.right; for (j = pos.in.start; j != pos.in.stop; j--) { e = pos.in.order[j]; if (edge_right[e] >= right && right > edge_left[e]) { s->edges_in[i] = pos.in.order[j]; i++; } } } s->n_edges_out = 0; s->n_edges_in = i; } ret = 0; out: return ret; } static int compute_two_tree_branch_state_update(const tsk_treeseq_t *ts, tsk_id_t c, const iter_state *A_state, const iter_state *B_state, tsk_size_t state_dim, tsk_size_t result_dim, int sign, general_stat_func_t *f, sample_count_stat_params_t *f_params, two_locus_work_t *restrict work, double *result) { int ret = 0; double a_len, b_len; double *restrict B_branch_len = B_state->branch_len; 
double *weights_row; tsk_size_t n, k, a_row, b_row; const double *restrict A_branch_len = A_state->branch_len; const tsk_bitset_t *restrict A_state_samples = A_state->node_samples; const tsk_bitset_t *restrict B_state_samples = B_state->node_samples; tsk_size_t num_nodes = ts->tables->nodes.num_rows; double *weights = work->weights; double *result_tmp = work->result_tmp; tsk_bitset_t AB_samples = work->AB_samples; b_len = B_branch_len[c] * sign; if (b_len == 0) { return ret; } for (n = 0; n < num_nodes; n++) { a_len = A_branch_len[n]; if (a_len == 0) { continue; } for (k = 0; k < state_dim; k++) { a_row = (state_dim * n) + k; b_row = (state_dim * (tsk_size_t) c) + k; weights_row = GET_2D_ROW(weights, 3, k); tsk_bitset_intersect( A_state_samples, a_row, B_state_samples, b_row, &AB_samples); weights_row[0] = (double) tsk_bitset_count(&AB_samples, 0); weights_row[1] = (double) tsk_bitset_count(A_state_samples, a_row) - weights_row[0]; weights_row[2] = (double) tsk_bitset_count(B_state_samples, b_row) - weights_row[0]; } ret = f(state_dim, weights, result_dim, result_tmp, f_params); if (ret != 0) { goto out; } for (k = 0; k < result_dim; k++) { result[k] += result_tmp[k] * a_len * b_len; } } out: return ret; } static int compute_two_tree_branch_stat(const tsk_treeseq_t *ts, const iter_state *l_state, iter_state *r_state, general_stat_func_t *f, sample_count_stat_params_t *f_params, tsk_size_t result_dim, tsk_size_t state_dim, double *result) { int ret = 0; tsk_id_t e, c, ec, p, *updated_nodes = NULL; tsk_size_t j, k, n_updates; const double *restrict time = ts->tables->nodes.time; const tsk_id_t *restrict edges_child = ts->tables->edges.child; const tsk_id_t *restrict edges_parent = ts->tables->edges.parent; const tsk_size_t num_nodes = ts->tables->nodes.num_rows; tsk_bitset_t updates, *r_samples = r_state->node_samples; two_locus_work_t work; tsk_memset(&work, 0, sizeof(work)); tsk_memset(&updates, 0, sizeof(updates)); // only two alleles are possible for branch stats 
ret = two_locus_work_init(2, ts->num_samples, result_dim, state_dim, &work); if (ret != 0) { goto out; } ret = tsk_bitset_init(&updates, num_nodes, 1); if (ret != 0) { goto out; } updated_nodes = tsk_calloc(num_nodes, sizeof(*updated_nodes)); if (updated_nodes == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } // Identify modified nodes both added and removed for (j = 0; j < r_state->n_edges_out + r_state->n_edges_in; j++) { e = j < r_state->n_edges_out ? r_state->edges_out[j] : r_state->edges_in[j - r_state->n_edges_out]; p = edges_parent[e]; c = edges_child[e]; // Identify affected nodes above child while (p != TSK_NULL) { tsk_bitset_set_bit(&updates, 0, (tsk_bitset_val_t) c); c = p; p = r_state->parent[p]; } } // Subtract the whole contribution from the child node tsk_bitset_get_items(&updates, 0, updated_nodes, &n_updates); while (n_updates != 0) { n_updates--; c = updated_nodes[n_updates]; compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, result_dim, -1, f, f_params, &work, result); } // Remove samples under nodes from removed edges to parent nodes for (j = 0; j < r_state->n_edges_out; j++) { e = r_state->edges_out[j]; p = edges_parent[e]; ec = edges_child[e]; // edge child while (p != TSK_NULL) { for (k = 0; k < state_dim; k++) { tsk_bitset_subtract(r_samples, (state_dim * (tsk_size_t) p) + k, r_samples, (state_dim * (tsk_size_t) ec) + k); } p = r_state->parent[p]; } r_state->branch_len[ec] = 0; r_state->parent[ec] = TSK_NULL; } // Add samples under nodes from added edges for (j = 0; j < r_state->n_edges_in; j++) { e = r_state->edges_in[j]; p = edges_parent[e]; ec = c = edges_child[e]; r_state->branch_len[c] = time[p] - time[c]; r_state->parent[c] = p; while (p != TSK_NULL) { tsk_bitset_set_bit(&updates, 0, (tsk_bitset_val_t) c); for (k = 0; k < state_dim; k++) { tsk_bitset_union(r_samples, (state_dim * (tsk_size_t) p) + k, r_samples, (state_dim * (tsk_size_t) ec) + k); } c = p; p = r_state->parent[p]; } } // Update all affected child 
nodes (fully subtracted, deferred from addition) n_updates = 0; tsk_bitset_get_items(&updates, 0, updated_nodes, &n_updates); while (n_updates != 0) { n_updates--; c = updated_nodes[n_updates]; compute_two_tree_branch_state_update(ts, c, l_state, r_state, state_dim, result_dim, +1, f, f_params, &work, result); } out: tsk_safe_free(updated_nodes); two_locus_work_free(&work); tsk_bitset_free(&updates); return ret; } static int tsk_treeseq_two_branch_count_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t result_dim, general_stat_func_t *f, sample_count_stat_params_t *f_params, norm_func_t *TSK_UNUSED(norm_f), tsk_size_t n_rows, const double *row_positions, tsk_size_t n_cols, const double *col_positions, tsk_flags_t TSK_UNUSED(options), double *result) { int ret = 0; int r, c; tsk_id_t *row_indexes = NULL, *col_indexes = NULL; tsk_size_t i, j, k, row, col, *row_repeats = NULL, *col_repeats = NULL; tsk_bitset_t node_samples, sample_sets_bits; iter_state l_state, r_state; double *result_tmp = NULL, *result_row; const tsk_size_t num_nodes = self->tables->nodes.num_rows; tsk_memset(&sample_sets_bits, 0, sizeof(sample_sets_bits)); tsk_memset(&node_samples, 0, sizeof(node_samples)); tsk_memset(&l_state, 0, sizeof(l_state)); tsk_memset(&r_state, 0, sizeof(r_state)); result_tmp = tsk_malloc(result_dim * sizeof(*result_tmp)); if (result_tmp == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = iter_state_init(&l_state, self, state_dim); if (ret != 0) { goto out; } ret = iter_state_init(&r_state, self, state_dim); if (ret != 0) { goto out; } ret = sample_sets_to_bitset( self, sample_set_sizes, sample_sets, num_sample_sets, &sample_sets_bits); if (ret != 0) { goto out; } ret = positions_to_tree_indexes(self, row_positions, n_rows, &row_indexes); if (ret != 0) { goto out; } ret = positions_to_tree_indexes(self, col_positions, n_cols, &col_indexes); if (ret 
!= 0) { goto out; } ret = get_index_counts(row_indexes, n_rows, &row_repeats); if (ret != 0) { goto out; } ret = get_index_counts(col_indexes, n_cols, &col_repeats); if (ret != 0) { goto out; } ret = get_node_samples(self, state_dim, &sample_sets_bits, &node_samples); if (ret != 0) { goto out; } iter_state_clear(&l_state, state_dim, num_nodes, &node_samples); row = 0; for (r = 0; r < (row_indexes[n_rows ? n_rows - 1U : 0] - row_indexes[0] + 1); r++) { tsk_memset(result_tmp, 0, result_dim * sizeof(*result_tmp)); iter_state_clear(&r_state, state_dim, num_nodes, &node_samples); ret = advance_collect_edges(&l_state, (tsk_id_t) r + row_indexes[0]); if (ret != 0) { goto out; } result_row = GET_2D_ROW(result, result_dim * n_cols, row); ret = compute_two_tree_branch_stat( self, &r_state, &l_state, f, f_params, result_dim, state_dim, result_tmp); if (ret != 0) { goto out; } col = 0; for (c = 0; c < (col_indexes[n_cols ? n_cols - 1 : 0] - col_indexes[0] + 1); c++) { ret = advance_collect_edges(&r_state, (tsk_id_t) c + col_indexes[0]); if (ret != 0) { goto out; } ret = compute_two_tree_branch_stat(self, &l_state, &r_state, f, f_params, result_dim, state_dim, result_tmp); if (ret != 0) { goto out; } for (i = 0; i < row_repeats[r]; i++) { for (j = 0; j < col_repeats[c]; j++) { result_row = GET_2D_ROW(result, result_dim * n_cols, row + i); for (k = 0; k < result_dim; k++) { result_row[col + (j * result_dim) + k] = result_tmp[k]; } } } col += (col_repeats[c] * result_dim); } row += row_repeats[r]; } out: tsk_safe_free(result_tmp); tsk_safe_free(row_indexes); tsk_safe_free(col_indexes); tsk_safe_free(row_repeats); tsk_safe_free(col_repeats); iter_state_free(&l_state); iter_state_free(&r_state); tsk_bitset_free(&node_samples); tsk_bitset_free(&sample_sets_bits); return ret; } static int check_sample_set_dups(tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, const tsk_id_t *restrict sample_index_map, tsk_size_t num_samples) { int ret; 
tsk_size_t j, k, l; tsk_id_t u, sample_index; tsk_bitset_t tmp; tsk_memset(&tmp, 0, sizeof(tmp)); ret = tsk_bitset_init(&tmp, num_samples, 1); if (ret != 0) { goto out; } j = 0; for (k = 0; k < num_sample_sets; k++) { tsk_memset(tmp.data, 0, sizeof(*tmp.data) * tmp.row_len); for (l = 0; l < sample_set_sizes[k]; l++) { u = sample_sets[j]; sample_index = sample_index_map[u]; if (tsk_bitset_contains(&tmp, 0, (tsk_bitset_val_t) sample_index)) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } tsk_bitset_set_bit(&tmp, 0, (tsk_bitset_val_t) sample_index); j++; } } out: tsk_bitset_free(&tmp); return ret; } int tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t out_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { // TODO: generalize this function if we ever decide to do weighted two_locus stats. // We only implement count stats and therefore we don't handle weights. 
int ret = 0; bool stat_site = !!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); tsk_size_t state_dim = num_sample_sets; sample_count_stat_params_t f_params = { .sample_sets = sample_sets, .num_sample_sets = num_sample_sets, .sample_set_sizes = sample_set_sizes, .set_indexes = set_indexes }; // We do not support two-locus node stats if (!!(options & TSK_STAT_NODE)) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE); goto out; } // If no mode is specified, we default to site mode if (!(stat_site || stat_branch)) { stat_site = true; } // It's an error to specify more than one mode if (stat_site + stat_branch > 1) { ret = tsk_trace_error(TSK_ERR_MULTIPLE_STAT_MODES); goto out; } ret = tsk_treeseq_check_sample_sets( self, num_sample_sets, sample_set_sizes, sample_sets); if (ret != 0) { goto out; } if (result_dim < 1) { ret = tsk_trace_error(TSK_ERR_BAD_RESULT_DIMS); goto out; } if (stat_site) { ret = check_sites(row_sites, out_rows, self->tables->sites.num_rows); if (ret != 0) { goto out; } ret = check_sites(col_sites, out_cols, self->tables->sites.num_rows); if (ret != 0) { goto out; } ret = check_sample_set_dups(num_sample_sets, sample_set_sizes, sample_sets, self->sample_index_map, self->num_samples); if (ret != 0) { goto out; } ret = tsk_treeseq_two_site_count_stat(self, state_dim, num_sample_sets, sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, row_sites, out_cols, col_sites, options, result); } else if (stat_branch) { ret = check_positions( row_positions, out_rows, tsk_treeseq_get_sequence_length(self)); if (ret != 0) { goto out; } ret = check_positions( col_positions, out_cols, tsk_treeseq_get_sequence_length(self)); if (ret != 0) { goto out; } ret = tsk_treeseq_two_branch_count_stat(self, state_dim, num_sample_sets, sample_set_sizes, sample_sets, result_dim, f, &f_params, norm_f, out_rows, row_positions, out_cols, col_positions, options, result); } out: return ret; } /*********************************** 
* Allele frequency spectrum ***********************************/ static inline void fold(tsk_size_t *restrict coordinate, const tsk_size_t *restrict dims, tsk_size_t num_dims) { tsk_size_t k; double n = 0; int s = 0; for (k = 0; k < num_dims; k++) { tsk_bug_assert(coordinate[k] < dims[k]); n += (double) dims[k] - 1; s += (int) coordinate[k]; } n /= 2; k = num_dims; while (s == n && k > 0) { k--; n -= ((double) (dims[k] - 1)) / 2; s -= (int) coordinate[k]; } if (s > n) { for (k = 0; k < num_dims; k++) { s = (int) (dims[k] - 1 - coordinate[k]); tsk_bug_assert(s >= 0); coordinate[k] = (tsk_size_t) s; } } } static int tsk_treeseq_update_site_afs(const tsk_treeseq_t *self, const tsk_site_t *site, const double *total_counts, const double *counts, tsk_size_t num_sample_sets, tsk_size_t window_index, tsk_size_t *result_dims, tsk_flags_t options, double *result) { int ret = 0; tsk_size_t afs_size; tsk_size_t k, allele, num_alleles, all_samples; double increment, *afs, *allele_counts, *allele_count; tsk_size_t *coordinate = tsk_malloc(num_sample_sets * sizeof(*coordinate)); bool polarised = !!(options & TSK_STAT_POLARISED); const tsk_size_t K = num_sample_sets + 1; if (coordinate == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = get_allele_weights( site, counts, K, total_counts, &num_alleles, &allele_counts); if (ret != 0) { goto out; } afs_size = result_dims[num_sample_sets]; afs = result + afs_size * window_index; increment = polarised ? 1 : 0.5; /* Sum over the allele weights. Skip the ancestral state if polarised. */ for (allele = polarised ? 
1 : 0; allele < num_alleles; allele++) { allele_count = GET_2D_ROW(allele_counts, K, allele); all_samples = (tsk_size_t) allele_count[num_sample_sets]; if (all_samples > 0 && all_samples < self->num_samples) { for (k = 0; k < num_sample_sets; k++) { coordinate[k] = (tsk_size_t) allele_count[k]; } if (!polarised) { fold(coordinate, result_dims, num_sample_sets); } increment_nd_array_value( afs, num_sample_sets, result_dims, coordinate, increment); } } out: tsk_safe_free(coordinate); tsk_safe_free(allele_counts); return ret; } static int tsk_treeseq_site_allele_frequency_spectrum(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, double *counts, tsk_size_t num_windows, const double *windows, tsk_size_t *result_dims, tsk_flags_t options, double *result) { int ret = 0; tsk_id_t u, v; tsk_size_t tree_site, tree_index, window_index; tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; const double *restrict edge_left = self->tables->edges.left; const double *restrict edge_right = self->tables->edges.right; const tsk_id_t *restrict edge_parent = self->tables->edges.parent; const tsk_id_t *restrict edge_child = self->tables->edges.child; const double sequence_length = self->tables->sequence_length; tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); tsk_site_t *site; tsk_id_t tj, tk, h; tsk_size_t j; const tsk_size_t K = num_sample_sets + 1; double t_left, t_right; double *total_counts = tsk_malloc((1 + num_sample_sets) * sizeof(*total_counts)); if (parent == NULL || total_counts == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); for (j = 0; j < num_sample_sets; j++) { total_counts[j] = (double) sample_set_sizes[j]; } 
total_counts[num_sample_sets] = (double) self->num_samples; /* Iterate over the trees */ tj = 0; tk = 0; t_left = 0; tree_index = 0; window_index = 0; while (tj < num_edges || t_left < sequence_length) { while (tk < num_edges && edge_right[O[tk]] == t_left) { h = O[tk]; tk++; u = edge_child[h]; v = edge_parent[h]; while (v != TSK_NULL) { update_state(counts, K, v, u, -1); v = parent[v]; } parent[u] = TSK_NULL; } while (tj < num_edges && edge_left[I[tj]] == t_left) { h = I[tj]; tj++; u = edge_child[h]; v = edge_parent[h]; parent[u] = v; while (v != TSK_NULL) { update_state(counts, K, v, u, +1); v = parent[v]; } } t_right = sequence_length; if (tj < num_edges) { t_right = TSK_MIN(t_right, edge_left[I[tj]]); } if (tk < num_edges) { t_right = TSK_MIN(t_right, edge_right[O[tk]]); } /* Update the sites */ for (tree_site = 0; tree_site < self->tree_sites_length[tree_index]; tree_site++) { site = self->tree_sites[tree_index] + tree_site; while (windows[window_index + 1] <= site->position) { window_index++; tsk_bug_assert(window_index < num_windows); } ret = tsk_treeseq_update_site_afs(self, site, total_counts, counts, num_sample_sets, window_index, result_dims, options, result); if (ret != 0) { goto out; } tsk_bug_assert(windows[window_index] <= site->position); tsk_bug_assert(site->position < windows[window_index + 1]); } tree_index++; t_left = t_right; } out: /* Can't use msp_safe_free here because of restrict */ if (parent != NULL) { free(parent); } tsk_safe_free(total_counts); return ret; } static void tsk_treeseq_update_branch_afs(const tsk_treeseq_t *self, tsk_id_t u, double right, double *restrict last_update, const double *restrict time, tsk_id_t *restrict parent, tsk_size_t *restrict coordinate, const double *counts, tsk_size_t num_sample_sets, tsk_size_t num_time_windows, const double *time_windows, tsk_size_t window_index, const tsk_size_t *result_dims, tsk_flags_t options, double *result) { tsk_size_t afs_size; tsk_size_t k; tsk_size_t time_window_index; double 
*afs; bool polarised = !!(options & TSK_STAT_POLARISED); const double *count_row = GET_2D_ROW(counts, num_sample_sets + 1, u); double x = 0; double t_u, t_v; double tw_branch_length = 0; const tsk_size_t all_samples = (tsk_size_t) count_row[num_sample_sets]; if (parent[u] != TSK_NULL) { t_u = time[u]; t_v = time[parent[u]]; if (0 < all_samples && all_samples < self->num_samples) { time_window_index = 0; afs_size = result_dims[num_sample_sets]; while (time_window_index < num_time_windows && time_windows[time_window_index] < t_v) { afs = result + afs_size * (window_index * num_time_windows + time_window_index); for (k = 0; k < num_sample_sets; k++) { coordinate[k] = (tsk_size_t) count_row[k]; } if (!polarised) { fold(coordinate, result_dims, num_sample_sets); } tw_branch_length = TSK_MAX(0.0, TSK_MIN(time_windows[time_window_index + 1], t_v) - TSK_MAX(time_windows[time_window_index], t_u)); x = (right - last_update[u]) * tw_branch_length; increment_nd_array_value( afs, num_sample_sets, result_dims, coordinate, x); time_window_index++; } } } last_update[u] = right; } static int tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, double *counts, tsk_size_t num_windows, const double *windows, tsk_size_t num_time_windows, const double *time_windows, const tsk_size_t *result_dims, tsk_flags_t options, double *result) { int ret = 0; tsk_id_t u, v; tsk_size_t window_index; tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_id_t num_edges = (tsk_id_t) self->tables->edges.num_rows; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; const double *restrict edge_left = self->tables->edges.left; const double *restrict edge_right = self->tables->edges.right; const tsk_id_t *restrict edge_parent = self->tables->edges.parent; const tsk_id_t *restrict edge_child = self->tables->edges.child; const double *restrict node_time = 
self->tables->nodes.time; const double sequence_length = self->tables->sequence_length; tsk_id_t *restrict parent = tsk_malloc(num_nodes * sizeof(*parent)); double *restrict last_update = tsk_calloc(num_nodes, sizeof(*last_update)); double *restrict branch_length = tsk_calloc(num_nodes, sizeof(*branch_length)); tsk_size_t *restrict coordinate = tsk_malloc(num_sample_sets * sizeof(*coordinate)); tsk_id_t tj, tk, h; double t_left, t_right, w_right; const tsk_size_t K = num_sample_sets + 1; if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) { ret = tsk_trace_error(TSK_ERR_TIME_UNCALIBRATED); goto out; } if (parent == NULL || last_update == NULL || coordinate == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(parent, 0xff, num_nodes * sizeof(*parent)); /* Iterate over the trees */ tj = 0; tk = 0; t_left = 0; window_index = 0; while (tj < num_edges || t_left < sequence_length) { tsk_bug_assert(window_index < num_windows); while (tk < num_edges && edge_right[O[tk]] == t_left) { h = O[tk]; tk++; u = edge_child[h]; v = edge_parent[h]; tsk_treeseq_update_branch_afs(self, u, t_left, last_update, node_time, parent, coordinate, counts, num_sample_sets, num_time_windows, time_windows, window_index, result_dims, options, result); while (v != TSK_NULL) { tsk_treeseq_update_branch_afs(self, v, t_left, last_update, node_time, parent, coordinate, counts, num_sample_sets, num_time_windows, time_windows, window_index, result_dims, options, result); update_state(counts, K, v, u, -1); v = parent[v]; } parent[u] = TSK_NULL; branch_length[u] = 0; } while (tj < num_edges && edge_left[I[tj]] == t_left) { h = I[tj]; tj++; u = edge_child[h]; v = edge_parent[h]; parent[u] = v; branch_length[u] = node_time[v] - node_time[u]; while (v != TSK_NULL) { tsk_treeseq_update_branch_afs(self, v, t_left, last_update, node_time, parent, coordinate, counts, num_sample_sets, num_time_windows, time_windows, window_index, result_dims, options, result); 
update_state(counts, K, v, u, +1); v = parent[v]; } } t_right = sequence_length; if (tj < num_edges) { t_right = TSK_MIN(t_right, edge_left[I[tj]]); } if (tk < num_edges) { t_right = TSK_MIN(t_right, edge_right[O[tk]]); } while (window_index < num_windows && windows[window_index + 1] <= t_right) { w_right = windows[window_index + 1]; /* Flush the contributions of all nodes to the current window */ for (u = 0; u < (tsk_id_t) num_nodes; u++) { tsk_bug_assert(last_update[u] < w_right); tsk_treeseq_update_branch_afs(self, u, w_right, last_update, node_time, parent, coordinate, counts, num_sample_sets, num_time_windows, time_windows, window_index, result_dims, options, result); } window_index++; } t_left = t_right; } out: /* Can't use msp_safe_free here because of restrict */ if (parent != NULL) { free(parent); } if (last_update != NULL) { free(last_update); } if (branch_length != NULL) { free(branch_length); } if (coordinate != NULL) { free(coordinate); } return ret; } int tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_size_t num_time_windows, const double *time_windows, tsk_flags_t options, double *result) { int ret = 0; bool stat_site = !!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); bool stat_node = !!(options & TSK_STAT_NODE); const double default_windows[] = { 0, self->tables->sequence_length }; const double default_time_windows[] = { 0, INFINITY }; const tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_size_t K = num_sample_sets + 1; tsk_size_t j, k, l, afs_size; tsk_id_t u; tsk_size_t *result_dims = NULL; /* These counts should really be ints, but we use doubles so that we can * reuse code from the general_stats code paths. 
*/ double *counts = NULL; double *count_row; if (stat_node) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE); goto out; } /* If no mode is specified, we default to site mode */ if (!(stat_site || stat_branch)) { stat_site = true; } /* It's an error to specify more than one mode */ if (stat_site + stat_branch > 1) { ret = tsk_trace_error(TSK_ERR_MULTIPLE_STAT_MODES); goto out; } if (windows == NULL) { num_windows = 1; windows = default_windows; } else { ret = tsk_treeseq_check_windows( self, num_windows, windows, TSK_REQUIRE_FULL_SPAN); if (ret != 0) { goto out; } } if (time_windows == NULL) { num_time_windows = 1; time_windows = default_time_windows; } else { ret = tsk_treeseq_check_time_windows(num_time_windows, time_windows); if (ret != 0) { goto out; } // Site mode does not support time windows if (stat_site && !(time_windows[0] == 0.0 && isinf((float) time_windows[1]))) { ret = TSK_ERR_UNSUPPORTED_STAT_MODE; goto out; } } ret = tsk_treeseq_check_sample_sets( self, num_sample_sets, sample_set_sizes, sample_sets); if (ret != 0) { goto out; } /* the last element of result_dims stores the total size of the dimensions */ result_dims = tsk_malloc((num_sample_sets + 1) * sizeof(*result_dims)); counts = tsk_calloc(num_nodes * K, sizeof(*counts)); if (counts == NULL || result_dims == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } afs_size = 1; j = 0; for (k = 0; k < num_sample_sets; k++) { result_dims[k] = 1 + sample_set_sizes[k]; afs_size *= result_dims[k]; for (l = 0; l < sample_set_sizes[k]; l++) { u = sample_sets[j]; count_row = GET_2D_ROW(counts, K, u); if (count_row[k] != 0) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } count_row[k] = 1; j++; } } for (j = 0; j < self->num_samples; j++) { u = self->samples[j]; count_row = GET_2D_ROW(counts, K, u); count_row[num_sample_sets] = 1; } result_dims[num_sample_sets] = (tsk_size_t) afs_size; tsk_memset(result, 0, num_windows * num_time_windows * afs_size * sizeof(*result)); if 
(stat_site) { ret = tsk_treeseq_site_allele_frequency_spectrum(self, num_sample_sets, sample_set_sizes, counts, num_windows, windows, result_dims, options, result); } else { ret = tsk_treeseq_branch_allele_frequency_spectrum(self, num_sample_sets, counts, num_windows, windows, num_time_windows, time_windows, result_dims, options, result); } if (options & TSK_STAT_SPAN_NORMALISE) { span_normalise(num_windows, windows, afs_size * num_time_windows, result); } out: tsk_safe_free(counts); tsk_safe_free(result_dims); return ret; } /*********************************** * One way stats ***********************************/ static int diversity_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double n; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; result[j] = x[j] * (n - x[j]) / (n * (n - 1)); } return 0; } int tsk_treeseq_diversity(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { return tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, diversity_summary_func, num_windows, windows, options, result); } static int trait_covariance_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { weight_stat_params_t args = *(weight_stat_params_t *) params; const double n = (double) args.num_samples; const double *x = state; tsk_size_t j; for (j = 0; j < state_dim; j++) { result[j] = (x[j] * x[j]) / (2 * (n - 1) * (n - 1)); } return 0; } int tsk_treeseq_trait_covariance(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_flags_t 
options, double *result) { tsk_size_t num_samples = self->num_samples; tsk_size_t j, k; int ret; const double *row; double *new_row; double *means = tsk_calloc(num_weights, sizeof(double)); double *new_weights = tsk_malloc((num_weights + 1) * num_samples * sizeof(double)); weight_stat_params_t args = { num_samples = self->num_samples }; if (new_weights == NULL || means == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (num_weights == 0) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_WEIGHTS); goto out; } // center weights for (j = 0; j < num_samples; j++) { row = GET_2D_ROW(weights, num_weights, j); for (k = 0; k < num_weights; k++) { means[k] += row[k]; } } for (k = 0; k < num_weights; k++) { means[k] /= (double) num_samples; } for (j = 0; j < num_samples; j++) { row = GET_2D_ROW(weights, num_weights, j); new_row = GET_2D_ROW(new_weights, num_weights, j); for (k = 0; k < num_weights; k++) { new_row[k] = row[k] - means[k]; } } ret = tsk_treeseq_general_stat(self, num_weights, new_weights, num_weights, trait_covariance_summary_func, &args, num_windows, windows, options, result); out: tsk_safe_free(means); tsk_safe_free(new_weights); return ret; } static int trait_correlation_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { weight_stat_params_t args = *(weight_stat_params_t *) params; const double n = (double) args.num_samples; const double *x = state; double p; tsk_size_t j; p = x[state_dim - 1]; for (j = 0; j < state_dim - 1; j++) { if ((p > 0.0) && (p < 1.0)) { result[j] = (x[j] * x[j]) / (2 * (p * (1 - p)) * n * (n - 1)); } else { result[j] = 0.0; } } return 0; } int tsk_treeseq_trait_correlation(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { tsk_size_t num_samples = self->num_samples; tsk_size_t j, k; int ret; double *means = tsk_calloc(num_weights, 
sizeof(double)); double *meansqs = tsk_calloc(num_weights, sizeof(double)); double *sds = tsk_calloc(num_weights, sizeof(double)); const double *row; double *new_row; double *new_weights = tsk_malloc((num_weights + 1) * num_samples * sizeof(double)); weight_stat_params_t args = { num_samples = self->num_samples }; if (new_weights == NULL || means == NULL || meansqs == NULL || sds == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (num_weights < 1) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_WEIGHTS); goto out; } // center and scale weights for (j = 0; j < num_samples; j++) { row = GET_2D_ROW(weights, num_weights, j); for (k = 0; k < num_weights; k++) { means[k] += row[k]; meansqs[k] += row[k] * row[k]; } } for (k = 0; k < num_weights; k++) { means[k] /= (double) num_samples; meansqs[k] -= means[k] * means[k] * (double) num_samples; meansqs[k] /= (double) (num_samples - 1); sds[k] = sqrt(meansqs[k]); } for (j = 0; j < num_samples; j++) { row = GET_2D_ROW(weights, num_weights, j); new_row = GET_2D_ROW(new_weights, num_weights + 1, j); for (k = 0; k < num_weights; k++) { new_row[k] = (row[k] - means[k]) / sds[k]; } // set final row to 1/n to compute frequency new_row[num_weights] = 1.0 / (double) num_samples; } ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights, num_weights, trait_correlation_summary_func, &args, num_windows, windows, options, result); out: tsk_safe_free(means); tsk_safe_free(meansqs); tsk_safe_free(sds); tsk_safe_free(new_weights); return ret; } static int trait_linear_model_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t result_dim, double *result, void *params) { covariates_stat_params_t args = *(covariates_stat_params_t *) params; const double num_samples = (double) args.num_samples; const tsk_size_t k = args.num_covariates; const double *V = args.V; ; const double *x = state; const double *v; double m, a, denom, z; tsk_size_t i, j; // x[0], ..., x[result_dim - 1] contains the traits, W // 
x[result_dim], ..., x[state_dim - 2] contains the covariates, Z // x[state_dim - 1] has the number of samples below the node m = x[state_dim - 1]; for (i = 0; i < result_dim; i++) { if ((m > 0.0) && (m < num_samples)) { v = GET_2D_ROW(V, k, i); a = x[i]; denom = m; for (j = 0; j < k; j++) { z = x[result_dim + j]; a -= z * v[j]; denom -= z * z; } // denom is the length of projection of the trait onto the subspace // spanned by the covariates, so if it is zero then the system is // singular and the solution is nonunique. This numerical tolerance // could be smaller without hitting floating-point error, but being // a tiny bit conservative about when the trait is almost in the // span of the covariates is probably good. if (denom < 1e-8) { result[i] = 0.0; } else { result[i] = (a * a) / (2 * denom * denom); } } else { result[i] = 0.0; } } return 0; } int tsk_treeseq_trait_linear_model(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_covariates, const double *covariates, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { tsk_size_t num_samples = self->num_samples; tsk_size_t i, j, k; int ret; const double *w, *z; double *v, *new_row; double *V = tsk_calloc(num_covariates * num_weights, sizeof(double)); double *new_weights = tsk_malloc((num_weights + num_covariates + 1) * num_samples * sizeof(double)); covariates_stat_params_t args = { .num_samples = self->num_samples, .num_covariates = num_covariates, .V = V }; // We assume that the covariates have been *already standardised*, // so that (a) 1 is in the span of the columns, and // (b) their crossproduct is the identity. // We could do this instead here with gsl linalg. 
if (new_weights == NULL || V == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (num_weights < 1) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_WEIGHTS); goto out; } // V = weights^T (matrix mult) covariates for (k = 0; k < num_samples; k++) { w = GET_2D_ROW(weights, num_weights, k); z = GET_2D_ROW(covariates, num_covariates, k); for (i = 0; i < num_weights; i++) { v = GET_2D_ROW(V, num_covariates, i); for (j = 0; j < num_covariates; j++) { v[j] += w[i] * z[j]; } } } for (k = 0; k < num_samples; k++) { w = GET_2D_ROW(weights, num_weights, k); z = GET_2D_ROW(covariates, num_covariates, k); new_row = GET_2D_ROW(new_weights, num_covariates + num_weights + 1, k); for (i = 0; i < num_weights; i++) { new_row[i] = w[i]; } for (i = 0; i < num_covariates; i++) { new_row[i + num_weights] = z[i]; } // set final row to 1 to count alleles new_row[num_weights + num_covariates] = 1.0; } ret = tsk_treeseq_general_stat(self, num_weights + num_covariates + 1, new_weights, num_weights, trait_linear_model_summary_func, &args, num_windows, windows, options, result); out: tsk_safe_free(V); tsk_safe_free(new_weights); return ret; } static int segregating_sites_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double n; tsk_size_t j; // this works because sum_{i=1}^k (1-p_i) = k-1 for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; result[j] = (x[j] > 0) * (1 - x[j] / n); } return 0; } int tsk_treeseq_segregating_sites(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { return tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, segregating_sites_summary_func, num_windows, 
windows, options, result); } static int Y1_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double ni, denom, numer; tsk_size_t i; for (i = 0; i < result_dim; i++) { ni = (double) args.sample_set_sizes[i]; denom = ni * (ni - 1) * (ni - 2); numer = x[i] * (ni - x[i]) * (ni - x[i] - 1); result[i] = numer / denom; } return 0; } int tsk_treeseq_Y1(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { return tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, Y1_summary_func, num_windows, windows, options, result); } static int D_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double p_AB = state_row[0] / n; double p_Ab = state_row[1] / n; double p_aB = state_row[2] / n; double p_A = p_AB + p_Ab; double p_B = p_AB + p_aB; result[j] = p_AB - (p_A * p_B); } return 0; } int tsk_treeseq_D(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { options |= TSK_STAT_POLARISED; // TODO: allow user to pick? 
return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, D_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int D2_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double p_AB = state_row[0] / n; double p_Ab = state_row[1] / n; double p_aB = state_row[2] / n; double p_A = p_AB + p_Ab; double p_B = p_AB + p_aB; result[j] = p_AB - (p_A * p_B); result[j] *= result[j]; } return 0; } int tsk_treeseq_D2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, D2_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int r2_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double p_AB = state_row[0] / n; double p_Ab = state_row[1] / n; double p_aB = state_row[2] / n; double p_A = p_AB + p_Ab; double p_B = p_AB + p_aB; double D = p_AB - (p_A * p_B); double denom = p_A * p_B * (1 - p_A) 
* (1 - p_B); result[j] = (D * D) / denom; } return 0; } int tsk_treeseq_r2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, r2_summary_func, norm_hap_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int D_prime_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double p_AB = state_row[0] / n; double p_Ab = state_row[1] / n; double p_aB = state_row[2] / n; double p_A = p_AB + p_Ab; double p_B = p_AB + p_aB; double D = p_AB - (p_A * p_B); if (D >= 0) { result[j] = D / TSK_MIN(p_A * (1 - p_B), (1 - p_A) * p_B); } else if (D < 0) { result[j] = D / TSK_MIN(p_A * p_B, (1 - p_A) * (1 - p_B)); } } return 0; } int tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { options |= TSK_STAT_POLARISED; // TODO: allow user to pick? 
return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, D_prime_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int r_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double p_AB = state_row[0] / n; double p_Ab = state_row[1] / n; double p_aB = state_row[2] / n; double p_A = p_AB + p_Ab; double p_B = p_AB + p_aB; double D = p_AB - (p_A * p_B); double denom = p_A * p_B * (1 - p_A) * (1 - p_B); result[j] = D / sqrt(denom); } return 0; } int tsk_treeseq_r(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { options |= TSK_STAT_POLARISED; // TODO: allow user to pick? 
return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, r_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int Dz_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double p_AB = state_row[0] / n; double p_Ab = state_row[1] / n; double p_aB = state_row[2] / n; double p_A = p_AB + p_Ab; double p_B = p_AB + p_aB; double D = p_AB - (p_A * p_B); result[j] = D * (1 - 2 * p_A) * (1 - 2 * p_B); } return 0; } int tsk_treeseq_Dz(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, Dz_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int pi2_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double p_AB = state_row[0] / n; double p_Ab = state_row[1] / n; double p_aB = state_row[2] / n; double p_A = p_AB + p_Ab; double p_B = p_AB + p_aB; result[j] = p_A * (1 - p_A) * p_B * (1 - 
p_B); } return 0; } int tsk_treeseq_pi2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, pi2_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int D2_unbiased_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double w_AB = state_row[0]; double w_Ab = state_row[1]; double w_aB = state_row[2]; double w_ab = n - (w_AB + w_Ab + w_aB); result[j] = (1 / (n * (n - 1) * (n - 2) * (n - 3))) * ((w_aB * w_aB * (w_Ab - 1) * w_Ab) + ((w_ab - 1) * w_ab * (w_AB - 1) * w_AB) - (w_aB * w_Ab * (w_Ab + (2 * w_ab * w_AB) - 1))); } return 0; } int tsk_treeseq_D2_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, D2_unbiased_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int Dz_unbiased_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t 
TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double w_AB = state_row[0]; double w_Ab = state_row[1]; double w_aB = state_row[2]; double w_ab = n - (w_AB + w_Ab + w_aB); result[j] = (1 / (n * (n - 1) * (n - 2) * (n - 3))) * ((((w_AB * w_ab) - (w_Ab * w_aB)) * (w_aB + w_ab - w_AB - w_Ab) * (w_Ab + w_ab - w_AB - w_aB)) - ((w_AB * w_ab) * (w_AB + w_ab - w_Ab - w_aB - 2)) - ((w_Ab * w_aB) * (w_Ab + w_aB - w_AB - w_ab - 2))); } return 0; } int tsk_treeseq_Dz_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, Dz_unbiased_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } static int pi2_unbiased_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t TSK_UNUSED(result_dim), double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; double n; const double *state_row; tsk_size_t j; for (j = 0; j < state_dim; j++) { n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); double w_AB = state_row[0]; double w_Ab = state_row[1]; double w_aB = state_row[2]; double w_ab = n - (w_AB + w_Ab + w_aB); result[j] = (1 / (n * (n - 1) * (n - 2) * (n - 3))) * (((w_AB + w_Ab) * (w_aB + w_ab) * (w_AB + w_aB) * (w_Ab + w_ab)) - ((w_AB * w_ab) * (w_AB + w_ab + (3 * w_Ab) + (3 * w_aB) - 1)) - ((w_Ab * w_aB) * (w_Ab + w_aB + 
(3 * w_AB) + (3 * w_ab) - 1))); } return 0; } int tsk_treeseq_pi2_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { return tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_sample_sets, NULL, pi2_unbiased_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); } /*********************************** * Two way stats ***********************************/ static int check_sample_stat_inputs(tsk_size_t num_sample_sets, tsk_size_t tuple_size, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples) { int ret = 0; if (num_sample_sets < 1) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_SAMPLE_SETS); goto out; } if (num_index_tuples < 1) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_INDEX_TUPLES); goto out; } ret = check_set_indexes( num_sample_sets, tuple_size * num_index_tuples, index_tuples); if (ret != 0) { goto out; } out: return ret; } static int divergence_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double ni, nj, denom; tsk_id_t i, j; tsk_size_t k; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; denom = ni * (nj - (i == j)); result[k] = x[i] * (nj - x[j]) / denom; } return 0; } int tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t 
num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, divergence_summary_func, num_windows, windows, options, result); out: return ret; } static int genetic_relatedness_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; tsk_id_t i, j; tsk_size_t k; double sumx = 0; double meanx, ni, nj; for (k = 0; k < state_dim; k++) { sumx += x[k] / (double) args.sample_set_sizes[k]; } meanx = sumx / (double) state_dim; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; result[k] = (x[i] / ni - meanx) * (x[j] / nj - meanx); } return 0; } static int genetic_relatedness_noncentred_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; tsk_id_t i, j; tsk_size_t k; double ni, nj; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; result[k] = x[i] * x[j] / (ni * nj); } return 0; } int tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 2, 
num_index_tuples, index_tuples); if (ret != 0) { goto out; } if (!(options & TSK_STAT_NONCENTRED)) { ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, genetic_relatedness_summary_func, num_windows, windows, options, result); } else { ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, genetic_relatedness_noncentred_summary_func, num_windows, windows, options, result); } out: return ret; } static int genetic_relatedness_weighted_summary_func(tsk_size_t state_dim, const double *state, tsk_size_t result_dim, double *result, void *params) { indexed_weight_stat_params_t args = *(indexed_weight_stat_params_t *) params; const double *x = state; tsk_id_t i, j; tsk_size_t k; double pn, ni, nj; pn = state[state_dim - 1]; for (k = 0; k < result_dim; k++) { i = args.index_tuples[2 * k]; j = args.index_tuples[2 * k + 1]; ni = args.total_weights[i]; nj = args.total_weights[j]; result[k] = (x[i] - ni * pn) * (x[j] - nj * pn); } return 0; } static int genetic_relatedness_weighted_noncentred_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { indexed_weight_stat_params_t args = *(indexed_weight_stat_params_t *) params; const double *x = state; tsk_id_t i, j; tsk_size_t k; for (k = 0; k < result_dim; k++) { i = args.index_tuples[2 * k]; j = args.index_tuples[2 * k + 1]; result[k] = x[i] * x[j]; } return 0; } int tsk_treeseq_genetic_relatedness_weighted(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, tsk_flags_t options) { int ret = 0; tsk_size_t num_samples = self->num_samples; size_t j, k; indexed_weight_stat_params_t args; const double *row; double *new_row; double *total_weights = tsk_calloc((num_weights + 1), 
sizeof(*total_weights)); double *new_weights = tsk_malloc((num_weights + 1) * num_samples * sizeof(*new_weights)); if (total_weights == NULL || new_weights == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (num_weights == 0) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_WEIGHTS); goto out; } // Add a column of ones to W for (j = 0; j < num_samples; j++) { row = GET_2D_ROW(weights, num_weights, j); new_row = GET_2D_ROW(new_weights, num_weights + 1, j); for (k = 0; k < num_weights; k++) { new_row[k] = row[k]; total_weights[k] += row[k]; } new_row[num_weights] = 1.0 / (double) num_samples; } total_weights[num_weights] = 1.0; args.total_weights = total_weights; args.index_tuples = index_tuples; if (!(options & TSK_STAT_NONCENTRED)) { ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights, num_index_tuples, genetic_relatedness_weighted_summary_func, &args, num_windows, windows, options, result); if (ret != 0) { goto out; } } else { ret = tsk_treeseq_general_stat(self, num_weights + 1, new_weights, num_index_tuples, genetic_relatedness_weighted_noncentred_summary_func, &args, num_windows, windows, options, result); if (ret != 0) { goto out; } } out: tsk_safe_free(total_weights); tsk_safe_free(new_weights); return ret; } static int Y2_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double ni, nj, denom; tsk_id_t i, j; tsk_size_t k; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; denom = ni * nj * (nj - 1); result[k] = x[i] * (nj - x[j]) * (nj - x[j] - 1) / denom; } return 0; } int tsk_treeseq_Y2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const 
tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, Y2_summary_func, num_windows, windows, options, result); out: return ret; } static int f2_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double ni, nj, denom, numer; tsk_id_t i, j; tsk_size_t k; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; denom = ni * (ni - 1) * nj * (nj - 1); numer = x[i] * (x[i] - 1) * (nj - x[j]) * (nj - x[j] - 1) - x[i] * (ni - x[i]) * (nj - x[j]) * x[j]; result[k] = numer / denom; } return 0; } int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, f2_summary_func, num_windows, windows, options, result); out: return ret; } static int D2_ij_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *state_row; double n; tsk_size_t k; tsk_id_t i, j; double p_A, p_B, p_AB, p_Ab, 
p_aB, D_i, D_j; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; n = (double) args.sample_set_sizes[i]; state_row = GET_2D_ROW(state, 3, i); p_AB = state_row[0] / n; p_Ab = state_row[1] / n; p_aB = state_row[2] / n; p_A = p_AB + p_Ab; p_B = p_AB + p_aB; D_i = p_AB - (p_A * p_B); n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); p_AB = state_row[0] / n; p_Ab = state_row[1] / n; p_aB = state_row[2] / n; p_A = p_AB + p_Ab; p_B = p_AB + p_aB; D_j = p_AB - (p_A * p_B); result[k] = D_i * D_j; } return 0; } int tsk_treeseq_D2_ij(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, D2_ij_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); out: return ret; } static int D2_ij_unbiased_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *state_row; tsk_size_t k; tsk_id_t i, j; double n_i, n_j; double w_AB_i, w_Ab_i, w_aB_i, w_ab_i; double w_AB_j, w_Ab_j, w_aB_j, w_ab_j; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; if (i == j) { // We require disjoint sample sets because we test equality here n_i = (double) args.sample_set_sizes[i]; state_row = GET_2D_ROW(state, 3, i); 
w_AB_i = state_row[0]; w_Ab_i = state_row[1]; w_aB_i = state_row[2]; w_ab_i = n_i - (w_AB_i + w_Ab_i + w_aB_i); result[k] = (w_AB_i * (w_AB_i - 1) * w_ab_i * (w_ab_i - 1) + w_Ab_i * (w_Ab_i - 1) * w_aB_i * (w_aB_i - 1) - 2 * w_AB_i * w_Ab_i * w_aB_i * w_ab_i) / n_i / (n_i - 1) / (n_i - 2) / (n_i - 3); } else { n_i = (double) args.sample_set_sizes[i]; state_row = GET_2D_ROW(state, 3, i); w_AB_i = state_row[0]; w_Ab_i = state_row[1]; w_aB_i = state_row[2]; w_ab_i = n_i - (w_AB_i + w_Ab_i + w_aB_i); n_j = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); w_AB_j = state_row[0]; w_Ab_j = state_row[1]; w_aB_j = state_row[2]; w_ab_j = n_j - (w_AB_j + w_Ab_j + w_aB_j); result[k] = (w_Ab_i * w_aB_i - w_AB_i * w_ab_i) * (w_Ab_j * w_aB_j - w_AB_j * w_ab_j) / n_i / (n_i - 1) / n_j / (n_j - 1); } } return 0; } int tsk_treeseq_D2_ij_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, D2_ij_unbiased_summary_func, norm_total_weighted, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); out: return ret; } static int r2_ij_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *state_row; tsk_size_t k; tsk_id_t i, j; double n, pAB, pAb, paB, pA, pB, D_i, D_j, denom_i, denom_j; for (k = 0; k < result_dim; 
k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; n = (double) args.sample_set_sizes[i]; state_row = GET_2D_ROW(state, 3, i); pAB = state_row[0] / n; pAb = state_row[1] / n; paB = state_row[2] / n; pA = pAB + pAb; pB = pAB + paB; D_i = pAB - (pA * pB); denom_i = sqrt(pA * (1 - pA) * pB * (1 - pB)); n = (double) args.sample_set_sizes[j]; state_row = GET_2D_ROW(state, 3, j); pAB = state_row[0] / n; pAb = state_row[1] / n; paB = state_row[2] / n; pA = pAB + pAb; pB = pAB + paB; D_j = pAB - (pA * pB); denom_j = sqrt(pA * (1 - pA) * pB * (1 - pB)); result[k] = (D_i * D_j) / (denom_i * denom_j); } return 0; } int tsk_treeseq_r2_ij(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_two_locus_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, r2_ij_summary_func, norm_hap_weighted_ij, num_rows, row_sites, row_positions, num_cols, col_sites, col_positions, options, result); out: return ret; } /*********************************** * Three way stats ***********************************/ static int Y3_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double ni, nj, nk, denom, numer; tsk_id_t i, j, k; tsk_size_t tuple_index; for (tuple_index = 0; tuple_index < result_dim; tuple_index++) { i = args.set_indexes[3 * tuple_index]; j = args.set_indexes[3 * tuple_index + 1]; k = args.set_indexes[3 * 
tuple_index + 2]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; nk = (double) args.sample_set_sizes[k]; denom = ni * nj * nk; numer = x[i] * (nj - x[j]) * (nk - x[k]); result[tuple_index] = numer / denom; } return 0; } int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 3, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, Y3_summary_func, num_windows, windows, options, result); out: return ret; } static int f3_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double ni, nj, nk, denom, numer; tsk_id_t i, j, k; tsk_size_t tuple_index; for (tuple_index = 0; tuple_index < result_dim; tuple_index++) { i = args.set_indexes[3 * tuple_index]; j = args.set_indexes[3 * tuple_index + 1]; k = args.set_indexes[3 * tuple_index + 2]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; nk = (double) args.sample_set_sizes[k]; denom = ni * (ni - 1) * nj * nk; numer = x[i] * (x[i] - 1) * (nj - x[j]) * (nk - x[k]) - x[i] * (ni - x[i]) * (nj - x[j]) * x[k]; result[tuple_index] = numer / denom; } return 0; } int tsk_treeseq_f3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; ret = 
check_sample_stat_inputs(num_sample_sets, 3, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, f3_summary_func, num_windows, windows, options, result); out: return ret; } /*********************************** * Four way stats ***********************************/ static int f4_summary_func(tsk_size_t TSK_UNUSED(state_dim), const double *state, tsk_size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; double ni, nj, nk, nl, denom, numer; tsk_id_t i, j, k, l; tsk_size_t tuple_index; for (tuple_index = 0; tuple_index < result_dim; tuple_index++) { i = args.set_indexes[4 * tuple_index]; j = args.set_indexes[4 * tuple_index + 1]; k = args.set_indexes[4 * tuple_index + 2]; l = args.set_indexes[4 * tuple_index + 3]; ni = (double) args.sample_set_sizes[i]; nj = (double) args.sample_set_sizes[j]; nk = (double) args.sample_set_sizes[k]; nl = (double) args.sample_set_sizes[l]; denom = ni * nj * nk * nl; numer = x[i] * x[k] * (nj - x[j]) * (nl - x[l]) - x[i] * x[l] * (nj - x[j]) * (nk - x[k]); result[tuple_index] = numer / denom; } return 0; } int tsk_treeseq_f4(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 4, num_index_tuples, index_tuples); if (ret != 0) { goto out; } ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, f4_summary_func, num_windows, windows, options, result); out: return ret; } /* Error-raising getter functions */ int TSK_WARN_UNUSED tsk_treeseq_get_node(const tsk_treeseq_t *self, tsk_id_t 
index, tsk_node_t *node) { return tsk_node_table_get_row(&self->tables->nodes, index, node); } int TSK_WARN_UNUSED tsk_treeseq_get_edge(const tsk_treeseq_t *self, tsk_id_t index, tsk_edge_t *edge) { return tsk_edge_table_get_row(&self->tables->edges, index, edge); } int TSK_WARN_UNUSED tsk_treeseq_get_migration( const tsk_treeseq_t *self, tsk_id_t index, tsk_migration_t *migration) { return tsk_migration_table_get_row(&self->tables->migrations, index, migration); } int TSK_WARN_UNUSED tsk_treeseq_get_mutation( const tsk_treeseq_t *self, tsk_id_t index, tsk_mutation_t *mutation) { int ret = 0; ret = tsk_mutation_table_get_row(&self->tables->mutations, index, mutation); if (ret != 0) { goto out; } mutation->edge = self->site_mutations_mem[index].edge; mutation->inherited_state = self->site_mutations_mem[index].inherited_state; mutation->inherited_state_length = self->site_mutations_mem[index].inherited_state_length; out: return ret; } int TSK_WARN_UNUSED tsk_treeseq_get_site(const tsk_treeseq_t *self, tsk_id_t index, tsk_site_t *site) { int ret = 0; ret = tsk_site_table_get_row(&self->tables->sites, index, site); if (ret != 0) { goto out; } site->mutations = self->site_mutations[index]; site->mutations_length = self->site_mutations_length[index]; out: return ret; } int TSK_WARN_UNUSED tsk_treeseq_get_individual( const tsk_treeseq_t *self, tsk_id_t index, tsk_individual_t *individual) { int ret = 0; ret = tsk_individual_table_get_row(&self->tables->individuals, index, individual); if (ret != 0) { goto out; } individual->nodes = self->individual_nodes[index]; individual->nodes_length = self->individual_nodes_length[index]; out: return ret; } int TSK_WARN_UNUSED tsk_treeseq_get_population( const tsk_treeseq_t *self, tsk_id_t index, tsk_population_t *population) { return tsk_population_table_get_row(&self->tables->populations, index, population); } int TSK_WARN_UNUSED tsk_treeseq_get_provenance( const tsk_treeseq_t *self, tsk_id_t index, tsk_provenance_t *provenance) { 
return tsk_provenance_table_get_row(&self->tables->provenances, index, provenance); } int TSK_WARN_UNUSED tsk_treeseq_simplify(const tsk_treeseq_t *self, const tsk_id_t *samples, tsk_size_t num_samples, tsk_flags_t options, tsk_treeseq_t *output, tsk_id_t *node_map) { int ret = 0; tsk_table_collection_t *tables = tsk_malloc(sizeof(*tables)); if (tables == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_treeseq_copy_tables(self, tables, 0); if (ret != 0) { goto out; } ret = tsk_table_collection_simplify(tables, samples, num_samples, options, node_map); if (ret != 0) { goto out; } ret = tsk_treeseq_init( output, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TAKE_OWNERSHIP); /* Once tsk_treeseq_init has returned ownership of tables is transferred */ tables = NULL; out: if (tables != NULL) { tsk_table_collection_free(tables); tsk_safe_free(tables); } return ret; } int TSK_WARN_UNUSED tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t flags, tsk_id_t population, const char *metadata, tsk_size_t metadata_length, tsk_flags_t TSK_UNUSED(options), tsk_treeseq_t *output) { int ret = 0; tsk_table_collection_t *tables = tsk_malloc(sizeof(*tables)); const double *restrict node_time = self->tables->nodes.time; const tsk_size_t num_edges = self->tables->edges.num_rows; const tsk_size_t num_mutations = self->tables->mutations.num_rows; tsk_id_t *split_edge = tsk_malloc(num_edges * sizeof(*split_edge)); tsk_id_t j, u, mapped_node, ret_id; double mutation_time; tsk_edge_t edge; tsk_mutation_t mutation; tsk_bookmark_t sort_start; memset(output, 0, sizeof(*output)); if (split_edge == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_treeseq_copy_tables(self, tables, 0); if (ret != 0) { goto out; } if (tables->migrations.num_rows > 0) { ret = tsk_trace_error(TSK_ERR_MIGRATIONS_NOT_SUPPORTED); goto out; } /* We could catch this below in add_row, but it's simpler to guarantee * that we always catch the error in corner cases 
where the values * aren't used. */ if (population < -1 || population >= (tsk_id_t) self->tables->populations.num_rows) { ret = tsk_trace_error(TSK_ERR_POPULATION_OUT_OF_BOUNDS); goto out; } if (!tsk_isfinite(time)) { ret = tsk_trace_error(TSK_ERR_TIME_NONFINITE); goto out; } tsk_edge_table_clear(&tables->edges); tsk_memset(split_edge, TSK_NULL, num_edges * sizeof(*split_edge)); for (j = 0; j < (tsk_id_t) num_edges; j++) { /* Would prefer to use tsk_edge_table_get_row_unsafe, but it's * currently static to tables.c */ ret = tsk_edge_table_get_row(&self->tables->edges, j, &edge); tsk_bug_assert(ret == 0); if (node_time[edge.child] < time && time < node_time[edge.parent]) { u = tsk_node_table_add_row(&tables->nodes, flags, time, population, TSK_NULL, metadata, metadata_length); if (u < 0) { ret = (int) u; goto out; } ret_id = tsk_edge_table_add_row(&tables->edges, edge.left, edge.right, u, edge.child, edge.metadata, edge.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } edge.child = u; split_edge[j] = u; } ret_id = tsk_edge_table_add_row(&tables->edges, edge.left, edge.right, edge.parent, edge.child, edge.metadata, edge.metadata_length); if (ret_id < 0) { ret = (int) ret_id; goto out; } } for (j = 0; j < (tsk_id_t) num_mutations; j++) { /* Note: we could speed this up a bit by accessing the local * memory for mutations directly. */ ret = tsk_treeseq_get_mutation(self, j, &mutation); tsk_bug_assert(ret == 0); mapped_node = TSK_NULL; if (mutation.edge != TSK_NULL) { mapped_node = split_edge[mutation.edge]; } mutation_time = tsk_is_unknown_time(mutation.time) ? node_time[mutation.node] : mutation.time; if (mapped_node != TSK_NULL && mutation_time >= time) { /* Update the column in-place to save a bit of time. */ tables->mutations.node[j] = mapped_node; } } /* Skip mutations and sites as they haven't been altered */ /* Note we can probably optimise the edge sort a bit here also by * reasoning about when the first edge gets altered in the table. 
*/ memset(&sort_start, 0, sizeof(sort_start)); sort_start.sites = tables->sites.num_rows; sort_start.mutations = tables->mutations.num_rows; ret = tsk_table_collection_sort(tables, &sort_start, 0); if (ret != 0) { goto out; } ret = tsk_treeseq_init( output, tables, TSK_TS_INIT_BUILD_INDEXES | TSK_TAKE_OWNERSHIP); tables = NULL; out: if (tables != NULL) { tsk_table_collection_free(tables); tsk_safe_free(tables); } tsk_safe_free(split_edge); return ret; } /* ======================================================== * * tree_position * ======================================================== */ static void tsk_tree_position_set_null(tsk_tree_position_t *self) { self->index = -1; self->interval.left = 0; self->interval.right = 0; } int tsk_tree_position_init(tsk_tree_position_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t TSK_UNUSED(options)) { memset(self, 0, sizeof(*self)); self->tree_sequence = tree_sequence; tsk_tree_position_set_null(self); return 0; } int tsk_tree_position_free(tsk_tree_position_t *TSK_UNUSED(self)) { return 0; } int tsk_tree_position_print_state(const tsk_tree_position_t *self, FILE *out) { fprintf(out, "Tree position state\n"); fprintf(out, "index = %d\n", (int) self->index); fprintf(out, "interval = [%f,\t%f)\n", self->interval.left, self->interval.right); fprintf( out, "out = start=%d\tstop=%d\n", (int) self->out.start, (int) self->out.stop); fprintf( out, "in = start=%d\tstop=%d\n", (int) self->in.start, (int) self->in.stop); return 0; } bool tsk_tree_position_next(tsk_tree_position_t *self) { const tsk_table_collection_t *tables = self->tree_sequence->tables; const tsk_id_t M = (tsk_id_t) tables->edges.num_rows; const tsk_id_t num_trees = (tsk_id_t) self->tree_sequence->num_trees; const double *restrict left_coords = tables->edges.left; const tsk_id_t *restrict left_order = tables->indexes.edge_insertion_order; const double *restrict right_coords = tables->edges.right; const tsk_id_t *restrict right_order = 
tables->indexes.edge_removal_order;
    const double *restrict breakpoints = self->tree_sequence->breakpoints;
    tsk_id_t j, left_current_index, right_current_index;
    double left;

    /* Starting from the null state: begin at coordinate 0 with both edge
     * cursors at the start of their orderings */
    if (self->index == -1) {
        self->interval.right = 0;
        self->in.stop = 0;
        self->out.stop = 0;
        self->direction = TSK_DIR_FORWARD;
    }

    /* When the previous move was in reverse, the stop cursors were produced
     * by the backward scan (with the in/out roles swapped), so they are
     * shifted by one to resume scanning forwards */
    if (self->direction == TSK_DIR_FORWARD) {
        left_current_index = self->in.stop;
        right_current_index = self->out.stop;
    } else {
        left_current_index = self->out.stop + 1;
        right_current_index = self->in.stop + 1;
    }

    /* The new tree starts where the current interval ends */
    left = self->interval.right;

    /* Outgoing edges: those in removal order whose right coordinate is the
     * new left coordinate */
    j = right_current_index;
    self->out.start = j;
    while (j < M && right_coords[right_order[j]] == left) {
        j++;
    }
    self->out.stop = j;
    self->out.order = right_order;

    /* Incoming edges: those in insertion order whose left coordinate is the
     * new left coordinate */
    j = left_current_index;
    self->in.start = j;
    while (j < M && left_coords[left_order[j]] == left) {
        j++;
    }
    self->in.stop = j;
    self->in.order = left_order;

    self->direction = TSK_DIR_FORWARD;
    self->index++;
    if (self->index == num_trees) {
        /* Ran off the end: return to the null state */
        tsk_tree_position_set_null(self);
    } else {
        self->interval.left = left;
        self->interval.right = breakpoints[self->index + 1];
    }
    return self->index != -1;
}

/* Move the position one tree to the left, recording in self->out and
 * self->in the ranges of edges to remove and insert (out uses the insertion
 * order, in uses the removal order, both scanned downwards).
 *
 * Returns true if the position is on a valid tree afterwards, and false if
 * it has moved before the first tree and been reset to the null state. */
bool
tsk_tree_position_prev(tsk_tree_position_t *self)
{
    const tsk_table_collection_t *tables = self->tree_sequence->tables;
    const tsk_id_t M = (tsk_id_t) tables->edges.num_rows;
    const double sequence_length = tables->sequence_length;
    const tsk_id_t num_trees = (tsk_id_t) self->tree_sequence->num_trees;
    const double *restrict left_coords = tables->edges.left;
    const tsk_id_t *restrict left_order = tables->indexes.edge_insertion_order;
    const double *restrict right_coords = tables->edges.right;
    const tsk_id_t *restrict right_order = tables->indexes.edge_removal_order;
    const double *restrict breakpoints = self->tree_sequence->breakpoints;
    tsk_id_t j, left_current_index, right_current_index;
    double right;

    /* Starting from the null state: begin at the end of the sequence with
     * both edge cursors on the last entry of their orderings */
    if (self->index == -1) {
        self->index = num_trees;
        self->interval.left = sequence_length;
        self->in.stop = M - 1;
        self->out.stop = M - 1;
        self->direction = TSK_DIR_REVERSE;
    }

    /* When the previous move was forwards, the stop cursors were produced by
     * the forward scan (with the in/out roles swapped), so they are shifted
     * by one to resume scanning backwards */
    if (self->direction == TSK_DIR_REVERSE) {
        left_current_index = self->out.stop;
        right_current_index = self->in.stop;
    } else {
        left_current_index = self->in.stop - 1;
        right_current_index = self->out.stop - 1;
    }

    /* The new tree ends where the current interval starts */
    right = self->interval.left;

    /* Outgoing edges: those in insertion order whose left coordinate is the
     * new right coordinate */
    j = left_current_index;
    self->out.start = j;
    while (j >= 0 && left_coords[left_order[j]] == right) {
        j--;
    }
    self->out.stop = j;
    self->out.order = left_order;

    /* Incoming edges: those in removal order whose right coordinate is the
     * new right coordinate */
    j = right_current_index;
    self->in.start = j;
    while (j >= 0 && right_coords[right_order[j]] == right) {
        j--;
    }
    self->in.stop = j;
    self->in.order = right_order;

    self->index--;
    self->direction = TSK_DIR_REVERSE;
    if (self->index == -1) {
        /* Ran off the start: return to the null state */
        tsk_tree_position_set_null(self);
    } else {
        self->interval.left = breakpoints[self->index];
        self->interval.right = right;
    }
    return self->index != -1;
}

/* Jump forwards directly to tree `index`, which must be at or after the
 * current index and less than the number of trees (enforced with
 * tsk_bug_assert). */
int TSK_WARN_UNUSED
tsk_tree_position_seek_forward(tsk_tree_position_t *self, tsk_id_t index)
{
    int ret = 0;
    const tsk_table_collection_t *tables = self->tree_sequence->tables;
    const tsk_id_t M = (tsk_id_t) tables->edges.num_rows;
    const tsk_id_t num_trees = (tsk_id_t) self->tree_sequence->num_trees;
    const double *restrict left_coords = tables->edges.left;
    const tsk_id_t *restrict left_order = tables->indexes.edge_insertion_order;
    const double *restrict right_coords = tables->edges.right;
    const tsk_id_t *restrict right_order = tables->indexes.edge_removal_order;
    const double *restrict breakpoints = self->tree_sequence->breakpoints;
    tsk_id_t j, left_current_index, right_current_index;
    double left;

    tsk_bug_assert(index >= self->index && index < num_trees);

    /* Starting from the null state: same initial cursors as
     * tsk_tree_position_next */
    if (self->index == -1) {
        self->interval.right = 0;
        self->in.stop = 0;
        self->out.stop = 0;
        self->direction = TSK_DIR_FORWARD;
    }

    /* Shift the cursors by one when the previous move was in reverse */
    if (self->direction == TSK_DIR_FORWARD) {
        left_current_index = self->in.stop;
        right_current_index = self->out.stop;
    } else {
        left_current_index = self->out.stop + 1;
        right_current_index = self->in.stop + 1;
    }

    self->direction = TSK_DIR_FORWARD;
    left = breakpoints[index];

    /* Outgoing edges: every edge in removal order that ends at or before the
     * start of the target tree */
    j = right_current_index;
    self->out.start = j;
    while (j < M &&
right_coords[right_order[j]] <= left) {
        j++;
    }
    self->out.stop = j;
    if (self->index == -1) {
        /* Coming from the null state: report an empty outgoing range */
        self->out.start = self->out.stop;
    }

    /* Incoming edges. First skip edges in insertion order whose right
     * coordinate is <= left, i.e. that end at or before the start of the
     * target tree; the incoming range begins after them ... */
    j = left_current_index;
    while (j < M && right_coords[left_order[j]] <= left) {
        j++;
    }
    self->in.start = j;
    /* ... and runs over every edge that starts at or before the target tree's
     * left coordinate */
    while (j < M && left_coords[left_order[j]] <= left) {
        j++;
    }
    self->in.stop = j;

    self->interval.left = left;
    self->interval.right = breakpoints[index + 1];
    self->out.order = right_order;
    self->in.order = left_order;
    self->index = index;

    return ret;
}

/* Jump backwards directly to tree `index`, which must be at or before the
 * current index (enforced with tsk_bug_assert, after a null-state position
 * has been moved to just past the last tree). Mirror image of
 * tsk_tree_position_seek_forward. */
int TSK_WARN_UNUSED
tsk_tree_position_seek_backward(tsk_tree_position_t *self, tsk_id_t index)
{
    int ret = 0;
    const tsk_table_collection_t *tables = self->tree_sequence->tables;
    const tsk_id_t M = (tsk_id_t) tables->edges.num_rows;
    const double sequence_length = tables->sequence_length;
    const tsk_id_t num_trees = (tsk_id_t) self->tree_sequence->num_trees;
    const double *restrict left_coords = tables->edges.left;
    const tsk_id_t *restrict left_order = tables->indexes.edge_insertion_order;
    const double *restrict right_coords = tables->edges.right;
    const tsk_id_t *restrict right_order = tables->indexes.edge_removal_order;
    const double *restrict breakpoints = self->tree_sequence->breakpoints;
    tsk_id_t j, left_current_index, right_current_index;
    double right;

    /* Starting from the null state: same initial cursors as
     * tsk_tree_position_prev */
    if (self->index == -1) {
        self->index = num_trees;
        self->interval.left = sequence_length;
        self->in.stop = M - 1;
        self->out.stop = M - 1;
        self->direction = TSK_DIR_REVERSE;
    }
    tsk_bug_assert(index <= self->index);

    /* Shift the cursors by one when the previous move was forwards */
    if (self->direction == TSK_DIR_REVERSE) {
        left_current_index = self->out.stop;
        right_current_index = self->in.stop;
    } else {
        left_current_index = self->in.stop - 1;
        right_current_index = self->out.stop - 1;
    }

    self->direction = TSK_DIR_REVERSE;
    right = breakpoints[index + 1];

    /* Outgoing edges: every edge in insertion order that starts at or after
     * the end of the target tree */
    j = left_current_index;
    self->out.start = j;
    while (j >= 0 && left_coords[left_order[j]] >= right) {
        j--;
    }
    self->out.stop = j;
    if (self->index == num_trees) {
        /* Coming from the null state: report an empty outgoing range */
        self->out.start = self->out.stop;
    }

    /* Incoming edges. First skip edges in removal order whose left
     * coordinate is >= right, i.e. that start at or after the end of the
     * target tree; the incoming range begins after them ... */
    j = right_current_index;
    while (j >= 0 && left_coords[right_order[j]] >= right) {
        j--;
    }
    self->in.start = j;
    /* ... and runs over every edge that ends at or after the target tree's
     * right coordinate */
    while (j >= 0 && right_coords[right_order[j]] >= right) {
        j--;
    }
    self->in.stop = j;

    self->interval.right = right;
    self->interval.left = breakpoints[index];
    self->out.order = left_order;
    self->in.order = right_order;
    self->index = index;

    return ret;
}

/* ======================================================== *
 * Tree
 * ======================================================== */

/* Return the root for the specified node, found by following parent links
 * until a node with no parent is reached.
 * NOTE: no bounds checking is done here.
 */
static tsk_id_t
tsk_tree_get_node_root(const tsk_tree_t *self, tsk_id_t u)
{
    const tsk_id_t *restrict parent = self->parent;

    while (parent[u] != TSK_NULL) {
        u = parent[u];
    }
    return u;
}

/* Initialise `self` as a tree over `tree_sequence`.
 *
 * The struct is zeroed first, so tsk_tree_free may be called on it whatever
 * this function returns. Per-node arrays have num_nodes + 1 entries to make
 * room for the virtual root. Sample count arrays are allocated unless
 * TSK_NO_SAMPLE_COUNTS is set; sample list arrays only when TSK_SAMPLE_LISTS
 * is set. Returns TSK_ERR_BAD_PARAM_VALUE for a NULL tree sequence and
 * TSK_ERR_NO_MEMORY on allocation failure. */
int TSK_WARN_UNUSED
tsk_tree_init(tsk_tree_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options)
{
    int ret = 0;
    tsk_size_t num_samples, num_nodes, N;

    tsk_memset(self, 0, sizeof(tsk_tree_t));
    if (tree_sequence == NULL) {
        ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE);
        goto out;
    }
    num_nodes = tree_sequence->tables->nodes.num_rows;
    num_samples = tree_sequence->num_samples;
    self->num_nodes = num_nodes;
    /* The virtual root takes the ID one past the last real node */
    self->virtual_root = (tsk_id_t) num_nodes;
    self->tree_sequence = tree_sequence;
    self->samples = tree_sequence->samples;
    self->options = options;
    self->root_threshold = 1;

    /* Allocate space in the quintuply linked tree for the virtual root */
    N = num_nodes + 1;
    self->parent = tsk_malloc(N * sizeof(*self->parent));
    self->left_child = tsk_malloc(N * sizeof(*self->left_child));
    self->right_child = tsk_malloc(N * sizeof(*self->right_child));
    self->left_sib = tsk_malloc(N * sizeof(*self->left_sib));
    self->right_sib = tsk_malloc(N * sizeof(*self->right_sib));
    self->num_children = tsk_calloc(N, sizeof(*self->num_children));
    self->edge = tsk_malloc(N * sizeof(*self->edge));
    if (self->parent == NULL || self->left_child == NULL || self->right_child == NULL
        || self->left_sib == NULL || self->right_sib == NULL
        || self->num_children == NULL || self->edge == NULL) {
        ret =
tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { self->num_samples = tsk_calloc(N, sizeof(*self->num_samples)); self->num_tracked_samples = tsk_calloc(N, sizeof(*self->num_tracked_samples)); if (self->num_samples == NULL || self->num_tracked_samples == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } } if (self->options & TSK_SAMPLE_LISTS) { self->left_sample = tsk_malloc(N * sizeof(*self->left_sample)); self->right_sample = tsk_malloc(N * sizeof(*self->right_sample)); self->next_sample = tsk_malloc(num_samples * sizeof(*self->next_sample)); if (self->left_sample == NULL || self->right_sample == NULL || self->next_sample == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } } ret = tsk_tree_position_init(&self->tree_pos, tree_sequence, 0); if (ret != 0) { goto out; } ret = tsk_tree_clear(self); out: return ret; } int tsk_tree_set_root_threshold(tsk_tree_t *self, tsk_size_t root_threshold) { int ret = 0; if (root_threshold == 0) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } /* Don't allow the value to be set when the tree is out of the null * state */ if (self->index != -1) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_OPERATION); goto out; } self->root_threshold = root_threshold; /* Reset the roots */ ret = tsk_tree_clear(self); out: return ret; } tsk_size_t tsk_tree_get_root_threshold(const tsk_tree_t *self) { return self->root_threshold; } int tsk_tree_free(tsk_tree_t *self) { tsk_safe_free(self->parent); tsk_safe_free(self->left_child); tsk_safe_free(self->right_child); tsk_safe_free(self->left_sib); tsk_safe_free(self->right_sib); tsk_safe_free(self->num_samples); tsk_safe_free(self->num_tracked_samples); tsk_safe_free(self->left_sample); tsk_safe_free(self->right_sample); tsk_safe_free(self->next_sample); tsk_safe_free(self->num_children); tsk_safe_free(self->edge); tsk_tree_position_free(&self->tree_pos); return 0; } bool tsk_tree_has_sample_lists(const tsk_tree_t *self) { 
return !!(self->options & TSK_SAMPLE_LISTS); } bool tsk_tree_has_sample_counts(const tsk_tree_t *self) { return !(self->options & TSK_NO_SAMPLE_COUNTS); } static int TSK_WARN_UNUSED tsk_tree_reset_tracked_samples(tsk_tree_t *self) { int ret = 0; if (!tsk_tree_has_sample_counts(self)) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_OPERATION); goto out; } tsk_memset(self->num_tracked_samples, 0, (self->num_nodes + 1) * sizeof(*self->num_tracked_samples)); out: return ret; } int TSK_WARN_UNUSED tsk_tree_set_tracked_samples( tsk_tree_t *self, tsk_size_t num_tracked_samples, const tsk_id_t *tracked_samples) { int ret = TSK_ERR_GENERIC; tsk_size_t *tree_num_tracked_samples = self->num_tracked_samples; const tsk_id_t *parent = self->parent; tsk_size_t j; tsk_id_t u; /* TODO This is not needed when the tree is new. We should use the * state machine to check and only reset the tracked samples when needed. */ ret = tsk_tree_reset_tracked_samples(self); if (ret != 0) { goto out; } self->num_tracked_samples[self->virtual_root] = num_tracked_samples; for (j = 0; j < num_tracked_samples; j++) { u = tracked_samples[j]; if (u < 0 || u >= (tsk_id_t) self->num_nodes) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } if (!tsk_treeseq_is_sample(self->tree_sequence, u)) { ret = tsk_trace_error(TSK_ERR_BAD_SAMPLES); goto out; } if (self->num_tracked_samples[u] != 0) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } /* Propagate this upwards */ while (u != TSK_NULL) { tree_num_tracked_samples[u]++; u = parent[u]; } } out: return ret; } int TSK_WARN_UNUSED tsk_tree_track_descendant_samples(tsk_tree_t *self, tsk_id_t node) { int ret = 0; tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); const tsk_id_t *restrict parent = self->parent; const tsk_id_t *restrict left_child = self->left_child; const tsk_id_t *restrict right_sib = self->right_sib; const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; tsk_size_t 
*num_tracked_samples = self->num_tracked_samples; tsk_size_t n, j, num_nodes; tsk_id_t u, v; if (nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_tree_postorder_from(self, node, nodes, &num_nodes); if (ret != 0) { goto out; } ret = tsk_tree_reset_tracked_samples(self); if (ret != 0) { goto out; } u = 0; /* keep the compiler happy */ for (j = 0; j < num_nodes; j++) { u = nodes[j]; for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { num_tracked_samples[u] += num_tracked_samples[v]; } num_tracked_samples[u] += flags[u] & TSK_NODE_IS_SAMPLE ? 1 : 0; } n = num_tracked_samples[u]; u = parent[u]; while (u != TSK_NULL) { num_tracked_samples[u] = n; u = parent[u]; } num_tracked_samples[self->virtual_root] = n; out: tsk_safe_free(nodes); return ret; } int TSK_WARN_UNUSED tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) { int ret = TSK_ERR_GENERIC; tsk_size_t N = self->num_nodes + 1; if (!(options & TSK_NO_INIT)) { ret = tsk_tree_init(dest, self->tree_sequence, options); if (ret != 0) { goto out; } } if (self->tree_sequence != dest->tree_sequence) { ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); goto out; } dest->interval = self->interval; dest->left_index = self->left_index; dest->right_index = self->right_index; dest->direction = self->direction; dest->index = self->index; dest->sites = self->sites; dest->sites_length = self->sites_length; dest->root_threshold = self->root_threshold; dest->num_edges = self->num_edges; dest->tree_pos = self->tree_pos; tsk_memcpy(dest->parent, self->parent, N * sizeof(*self->parent)); tsk_memcpy(dest->left_child, self->left_child, N * sizeof(*self->left_child)); tsk_memcpy(dest->right_child, self->right_child, N * sizeof(*self->right_child)); tsk_memcpy(dest->left_sib, self->left_sib, N * sizeof(*self->left_sib)); tsk_memcpy(dest->right_sib, self->right_sib, N * sizeof(*self->right_sib)); tsk_memcpy(dest->num_children, self->num_children, N * sizeof(*self->num_children)); 
tsk_memcpy(dest->edge, self->edge, N * sizeof(*self->edge)); if (!(dest->options & TSK_NO_SAMPLE_COUNTS)) { if (self->options & TSK_NO_SAMPLE_COUNTS) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_OPERATION); goto out; } tsk_memcpy(dest->num_samples, self->num_samples, N * sizeof(*self->num_samples)); tsk_memcpy(dest->num_tracked_samples, self->num_tracked_samples, N * sizeof(*self->num_tracked_samples)); } if (dest->options & TSK_SAMPLE_LISTS) { if (!(self->options & TSK_SAMPLE_LISTS)) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_OPERATION); goto out; } tsk_memcpy(dest->left_sample, self->left_sample, N * sizeof(*self->left_sample)); tsk_memcpy( dest->right_sample, self->right_sample, N * sizeof(*self->right_sample)); tsk_memcpy(dest->next_sample, self->next_sample, self->tree_sequence->num_samples * sizeof(*self->next_sample)); } ret = 0; out: return ret; } bool TSK_WARN_UNUSED tsk_tree_equals(const tsk_tree_t *self, const tsk_tree_t *other) { bool ret = false; if (self->tree_sequence == other->tree_sequence) { ret = self->index == other->index; } return ret; } static int tsk_tree_check_node(const tsk_tree_t *self, tsk_id_t u) { int ret = 0; if (u < 0 || u > (tsk_id_t) self->num_nodes) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); } return ret; } bool tsk_tree_is_descendant(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v) { bool ret = false; tsk_id_t w = u; tsk_id_t *restrict parent = self->parent; if (tsk_tree_check_node(self, u) == 0 && tsk_tree_check_node(self, v) == 0) { while (w != v && w != TSK_NULL) { w = parent[w]; } ret = w == v; } return ret; } int TSK_WARN_UNUSED tsk_tree_get_mrca(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v, tsk_id_t *mrca) { int ret = 0; double tu, tv; const tsk_id_t *restrict parent = self->parent; const double *restrict time = self->tree_sequence->tables->nodes.time; ret = tsk_tree_check_node(self, u); if (ret != 0) { goto out; } ret = tsk_tree_check_node(self, v); if (ret != 0) { goto out; } /* Simplest to make the 
virtual_root a special case here to avoid * doing the time lookup. */ if (u == self->virtual_root || v == self->virtual_root) { *mrca = self->virtual_root; return 0; } tu = time[u]; tv = time[v]; while (u != v) { if (tu < tv) { u = parent[u]; if (u == TSK_NULL) { break; } tu = time[u]; } else { v = parent[v]; if (v == TSK_NULL) { break; } tv = time[v]; } } *mrca = u == v ? u : TSK_NULL; out: return ret; } static int tsk_tree_get_num_samples_by_traversal( const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_samples) { int ret = 0; tsk_size_t num_nodes, j; tsk_size_t count = 0; const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); tsk_id_t v; if (nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_tree_preorder_from(self, u, nodes, &num_nodes); if (ret != 0) { goto out; } for (j = 0; j < num_nodes; j++) { v = nodes[j]; if (flags[v] & TSK_NODE_IS_SAMPLE) { count++; } } *num_samples = count; out: tsk_safe_free(nodes); return ret; } int TSK_WARN_UNUSED tsk_tree_get_num_samples(const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_samples) { int ret = 0; ret = tsk_tree_check_node(self, u); if (ret != 0) { goto out; } if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { *num_samples = (tsk_size_t) self->num_samples[u]; } else { ret = tsk_tree_get_num_samples_by_traversal(self, u, num_samples); } out: return ret; } int TSK_WARN_UNUSED tsk_tree_get_num_tracked_samples( const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_tracked_samples) { int ret = 0; ret = tsk_tree_check_node(self, u); if (ret != 0) { goto out; } if (self->options & TSK_NO_SAMPLE_COUNTS) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_OPERATION); goto out; } *num_tracked_samples = self->num_tracked_samples[u]; out: return ret; } bool tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u) { return tsk_treeseq_is_sample(self->tree_sequence, u); } tsk_id_t tsk_tree_get_left_root(const 
tsk_tree_t *self) { return self->left_child[self->virtual_root]; } tsk_id_t tsk_tree_get_right_root(const tsk_tree_t *self) { return self->right_child[self->virtual_root]; } tsk_size_t tsk_tree_get_num_roots(const tsk_tree_t *self) { return (tsk_size_t) self->num_children[self->virtual_root]; } int TSK_WARN_UNUSED tsk_tree_get_parent(const tsk_tree_t *self, tsk_id_t u, tsk_id_t *parent) { int ret = 0; ret = tsk_tree_check_node(self, u); if (ret != 0) { goto out; } *parent = self->parent[u]; out: return ret; } int TSK_WARN_UNUSED tsk_tree_get_time(const tsk_tree_t *self, tsk_id_t u, double *t) { int ret = 0; tsk_node_t node; if (u == self->virtual_root) { *t = INFINITY; } else { ret = tsk_treeseq_get_node(self->tree_sequence, u, &node); if (ret != 0) { goto out; } *t = node.time; } out: return ret; } static inline double tsk_tree_get_branch_length_unsafe(const tsk_tree_t *self, tsk_id_t u) { const double *times = self->tree_sequence->tables->nodes.time; const tsk_id_t parent = self->parent[u]; return parent == TSK_NULL ? 0 : times[parent] - times[u]; } int TSK_WARN_UNUSED tsk_tree_get_branch_length(const tsk_tree_t *self, tsk_id_t u, double *ret_branch_length) { int ret = 0; ret = tsk_tree_check_node(self, u); if (ret != 0) { goto out; } *ret_branch_length = tsk_tree_get_branch_length_unsafe(self, u); out: return ret; } int tsk_tree_get_total_branch_length(const tsk_tree_t *self, tsk_id_t node, double *ret_tbl) { int ret = 0; tsk_size_t j, num_nodes; tsk_id_t u, v; const tsk_id_t *restrict parent = self->parent; const double *restrict time = self->tree_sequence->tables->nodes.time; tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); double sum = 0; if (nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_tree_preorder_from(self, node, nodes, &num_nodes); if (ret != 0) { goto out; } /* We always skip the first node because we don't return the branch length * over the input node. 
*/ for (j = 1; j < num_nodes; j++) { u = nodes[j]; v = parent[u]; if (v != TSK_NULL) { sum += time[v] - time[u]; } } *ret_tbl = sum; out: tsk_safe_free(nodes); return ret; } int TSK_WARN_UNUSED tsk_tree_get_sites( const tsk_tree_t *self, const tsk_site_t **sites, tsk_size_t *sites_length) { *sites = self->sites; *sites_length = self->sites_length; return 0; } /* u must be a valid node in the tree. For internal use */ static int tsk_tree_get_depth_unsafe(const tsk_tree_t *self, tsk_id_t u) { tsk_id_t v; const tsk_id_t *restrict parent = self->parent; int depth = 0; if (u == self->virtual_root) { return -1; } for (v = parent[u]; v != TSK_NULL; v = parent[v]) { depth++; } return depth; } int TSK_WARN_UNUSED tsk_tree_get_depth(const tsk_tree_t *self, tsk_id_t u, int *depth_ret) { int ret = 0; ret = tsk_tree_check_node(self, u); if (ret != 0) { goto out; } *depth_ret = tsk_tree_get_depth_unsafe(self, u); out: return ret; } static tsk_id_t tsk_tree_node_root(tsk_tree_t *self, tsk_id_t u) { tsk_id_t v = u; while (self->parent[v] != TSK_NULL) { v = self->parent[v]; } return v; } static void tsk_tree_check_state(const tsk_tree_t *self) { tsk_id_t u, v; tsk_size_t j, num_samples; int err, c; tsk_site_t site; tsk_id_t *children = tsk_malloc(self->num_nodes * sizeof(tsk_id_t)); bool *is_root = tsk_calloc(self->num_nodes, sizeof(bool)); tsk_bug_assert(children != NULL); /* Check the virtual root properties */ tsk_bug_assert(self->parent[self->virtual_root] == TSK_NULL); tsk_bug_assert(self->left_sib[self->virtual_root] == TSK_NULL); tsk_bug_assert(self->right_sib[self->virtual_root] == TSK_NULL); for (j = 0; j < self->tree_sequence->num_samples; j++) { u = self->samples[j]; while (self->parent[u] != TSK_NULL) { u = self->parent[u]; } is_root[u] = true; } if (self->tree_sequence->num_samples == 0) { tsk_bug_assert(self->left_child[self->virtual_root] == TSK_NULL); } /* Iterate over the roots and make sure they are set */ for (u = tsk_tree_get_left_root(self); u != TSK_NULL; u = 
self->right_sib[u]) { tsk_bug_assert(is_root[u]); is_root[u] = false; } for (u = 0; u < (tsk_id_t) self->num_nodes; u++) { tsk_bug_assert(!is_root[u]); c = 0; for (v = self->left_child[u]; v != TSK_NULL; v = self->right_sib[v]) { tsk_bug_assert(self->parent[v] == u); children[c] = v; c++; } for (v = self->right_child[u]; v != TSK_NULL; v = self->left_sib[v]) { tsk_bug_assert(c > 0); c--; tsk_bug_assert(v == children[c]); } } for (j = 0; j < self->sites_length; j++) { site = self->sites[j]; tsk_bug_assert(self->interval.left <= site.position); tsk_bug_assert(site.position < self->interval.right); } if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { tsk_bug_assert(self->num_samples != NULL); tsk_bug_assert(self->num_tracked_samples != NULL); for (u = 0; u < (tsk_id_t) self->num_nodes; u++) { err = tsk_tree_get_num_samples_by_traversal(self, u, &num_samples); tsk_bug_assert(err == 0); tsk_bug_assert(num_samples == (tsk_size_t) self->num_samples[u]); } } else { tsk_bug_assert(self->num_samples == NULL); tsk_bug_assert(self->num_tracked_samples == NULL); } if (self->options & TSK_SAMPLE_LISTS) { tsk_bug_assert(self->right_sample != NULL); tsk_bug_assert(self->left_sample != NULL); tsk_bug_assert(self->next_sample != NULL); } else { tsk_bug_assert(self->right_sample == NULL); tsk_bug_assert(self->left_sample == NULL); tsk_bug_assert(self->next_sample == NULL); } free(children); free(is_root); } void tsk_tree_print_state(const tsk_tree_t *self, FILE *out) { tsk_size_t j; tsk_site_t site; fprintf(out, "Tree state:\n"); fprintf(out, "options = %d\n", self->options); fprintf(out, "root_threshold = %lld\n", (long long) self->root_threshold); fprintf(out, "left = %f\n", self->interval.left); fprintf(out, "right = %f\n", self->interval.right); fprintf(out, "index = %lld\n", (long long) self->index); fprintf(out, "num_edges = %d\n", (int) self->num_edges); fprintf(out, "node\tedge\tparent\tlchild\trchild\tlsib\trsib"); if (self->options & TSK_SAMPLE_LISTS) { fprintf(out, 
"\thead\ttail"); } fprintf(out, "\n"); for (j = 0; j < self->num_nodes + 1; j++) { fprintf(out, "%lld\t%lld\t%lld\t%lld\t%lld\t%lld\t%lld", (long long) j, (long long) self->edge[j], (long long) self->parent[j], (long long) self->left_child[j], (long long) self->right_child[j], (long long) self->left_sib[j], (long long) self->right_sib[j]); if (self->options & TSK_SAMPLE_LISTS) { fprintf(out, "\t%lld\t%lld\t", (long long) self->left_sample[j], (long long) self->right_sample[j]); } if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { fprintf(out, "\t%lld\t%lld", (long long) self->num_samples[j], (long long) self->num_tracked_samples[j]); } fprintf(out, "\n"); } fprintf(out, "sites = \n"); for (j = 0; j < self->sites_length; j++) { site = self->sites[j]; fprintf(out, "\t%lld\t%f\n", (long long) site.id, site.position); } tsk_tree_check_state(self); } /* Methods for positioning the tree along the sequence */ /* The following methods are performance sensitive and so we use a * lot of restrict pointers. Because we are saying that we don't have * any aliases to these pointers, we pass around the reference to parent * since it's used in all the functions. 
*/

/* Rebuild the sample list bounds (left/right) of `node` and each of its
 * ancestors from the lists of their children, linking the children's
 * lists together through `next`. */
static inline void
tsk_tree_update_sample_lists(
    tsk_tree_t *self, tsk_id_t node, const tsk_id_t *restrict parent)
{
    tsk_id_t u, child;
    tsk_id_t *restrict left_child = self->left_child;
    tsk_id_t *restrict right_sib = self->right_sib;
    tsk_id_t *restrict head = self->left_sample;
    tsk_id_t *restrict tail = self->right_sample;
    tsk_id_t *restrict next = self->next_sample;
    const tsk_id_t *restrict sample_index_map = self->tree_sequence->sample_index_map;

    for (u = node; u != TSK_NULL; u = parent[u]) {
        if (sample_index_map[u] != TSK_NULL) {
            /* u is a sample: shrink its list back to its own entry */
            tail[u] = head[u];
        } else {
            head[u] = TSK_NULL;
            tail[u] = TSK_NULL;
        }
        for (child = left_child[u]; child != TSK_NULL; child = right_sib[child]) {
            if (head[child] == TSK_NULL) {
                continue;
            }
            tsk_bug_assert(tail[child] != TSK_NULL);
            if (head[u] == TSK_NULL) {
                head[u] = head[child];
            } else {
                next[tail[u]] = head[child];
            }
            tail[u] = tail[child];
        }
    }
}

/* Unlink child c from parent p in the sibling lists and clear c's
 * parent and sibling pointers. */
static inline void
tsk_tree_remove_branch(
    tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t *restrict parent)
{
    tsk_id_t *restrict left_child = self->left_child;
    tsk_id_t *restrict right_child = self->right_child;
    tsk_id_t *restrict left_sib = self->left_sib;
    tsk_id_t *restrict right_sib = self->right_sib;
    tsk_id_t *restrict num_children = self->num_children;
    const tsk_id_t prev = left_sib[c];
    const tsk_id_t following = right_sib[c];

    /* Patch the link that pointed at c from the left ... */
    if (prev != TSK_NULL) {
        right_sib[prev] = following;
    } else {
        left_child[p] = following;
    }
    /* ... and the one that pointed at c from the right. */
    if (following != TSK_NULL) {
        left_sib[following] = prev;
    } else {
        right_child[p] = prev;
    }
    parent[c] = TSK_NULL;
    left_sib[c] = TSK_NULL;
    right_sib[c] = TSK_NULL;
    num_children[p]--;
}

/* Append c as the new right-most child of p. */
static inline void
tsk_tree_insert_branch(
    tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t *restrict parent)
{
    tsk_id_t *restrict left_child = self->left_child;
    tsk_id_t *restrict right_child = self->right_child;
    tsk_id_t *restrict left_sib = self->left_sib;
    tsk_id_t *restrict right_sib = self->right_sib;
    tsk_id_t *restrict num_children = self->num_children;
    tsk_id_t last;

    parent[c] = p;
    last = right_child[p];
    if (last == TSK_NULL) {
        /* p had no children: c is also the left-most child */
        left_child[p] = c;
    } else {
        right_sib[last] = c;
    }
    left_sib[c] = last;
    right_sib[c] = TSK_NULL;
    right_child[p] = c;
    num_children[p]++;
}

/* Add `root` to the virtual root's child list while leaving its parent
 * pointer as TSK_NULL. */
static inline void
tsk_tree_insert_root(tsk_tree_t *self, tsk_id_t root, tsk_id_t *restrict parent)
{
    tsk_tree_insert_branch(self, self->virtual_root, root, parent);
    parent[root] = TSK_NULL;
}

/* Remove `root` from the virtual root's child list. */
static inline void
tsk_tree_remove_root(tsk_tree_t *self, tsk_id_t root, tsk_id_t *restrict parent)
{
    tsk_tree_remove_branch(self, self->virtual_root, root, parent);
}

/* Remove the edge p -> c from the tree: unlink the branch, subtract c's
 * sample counts from every node on the path above p, update the root
 * list according to the root threshold, and refresh the sample lists
 * when they are enabled. */
static void
tsk_tree_remove_edge(
    tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t TSK_UNUSED(edge_id))
{
    tsk_id_t *restrict parent = self->parent;
    tsk_size_t *restrict num_samples = self->num_samples;
    tsk_size_t *restrict num_tracked_samples = self->num_tracked_samples;
    tsk_id_t *restrict edge = self->edge;
    const tsk_size_t root_threshold = self->root_threshold;
    tsk_id_t u;
    tsk_id_t path_end = TSK_NULL;
    bool path_end_was_root = false;

#define POTENTIAL_ROOT(U) (num_samples[U] >= root_threshold)

    tsk_tree_remove_branch(self, p, c, parent);
    self->num_edges--;
    edge[c] = TSK_NULL;

    if (!(self->options & TSK_NO_SAMPLE_COUNTS)) {
        /* After the loop path_end is the top of the path above p and
         * path_end_was_root holds its status before the subtraction. */
        for (u = p; u != TSK_NULL; u = parent[u]) {
            path_end = u;
            path_end_was_root = POTENTIAL_ROOT(u);
            num_samples[u] -= num_samples[c];
            num_tracked_samples[u] -= num_tracked_samples[c];
        }
        if (path_end_was_root && !POTENTIAL_ROOT(path_end)) {
            tsk_tree_remove_root(self, path_end, parent);
        }
        if (POTENTIAL_ROOT(c)) {
            tsk_tree_insert_root(self, c, parent);
        }
    }

    if (self->options & TSK_SAMPLE_LISTS) {
        tsk_tree_update_sample_lists(self, p, parent);
    }
}

static void
tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t edge_id)
{
    tsk_id_t *restrict parent = self->parent;
    tsk_size_t *restrict num_samples = self->num_samples;
    tsk_size_t *restrict num_tracked_samples = self->num_tracked_samples;
    tsk_id_t *restrict edge = self->edge;
    const tsk_size_t root_threshold = self->root_threshold;
tsk_id_t u; tsk_id_t path_end = TSK_NULL; bool path_end_was_root = false; #define POTENTIAL_ROOT(U) (num_samples[U] >= root_threshold) if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { u = p; while (u != TSK_NULL) { path_end = u; path_end_was_root = POTENTIAL_ROOT(u); num_samples[u] += num_samples[c]; num_tracked_samples[u] += num_tracked_samples[c]; u = parent[u]; } if (POTENTIAL_ROOT(c)) { tsk_tree_remove_root(self, c, parent); } if (POTENTIAL_ROOT(path_end) && !path_end_was_root) { tsk_tree_insert_root(self, path_end, parent); } } tsk_tree_insert_branch(self, p, c, parent); self->num_edges++; edge[c] = edge_id; if (self->options & TSK_SAMPLE_LISTS) { tsk_tree_update_sample_lists(self, p, parent); } } int TSK_WARN_UNUSED tsk_tree_first(tsk_tree_t *self) { int ret = TSK_TREE_OK; ret = tsk_tree_clear(self); if (ret != 0) { goto out; } ret = tsk_tree_next(self); out: return ret; } int TSK_WARN_UNUSED tsk_tree_last(tsk_tree_t *self) { int ret = TSK_TREE_OK; ret = tsk_tree_clear(self); if (ret != 0) { goto out; } ret = tsk_tree_prev(self); out: return ret; } static void tsk_tree_update_index_and_interval(tsk_tree_t *self) { tsk_table_collection_t *tables = self->tree_sequence->tables; self->index = self->tree_pos.index; self->interval.left = self->tree_pos.interval.left; self->interval.right = self->tree_pos.interval.right; if (tables->sites.num_rows > 0) { self->sites = self->tree_sequence->tree_sites[self->index]; self->sites_length = self->tree_sequence->tree_sites_length[self->index]; } } int TSK_WARN_UNUSED tsk_tree_next(tsk_tree_t *self) { int ret = 0; tsk_table_collection_t *tables = self->tree_sequence->tables; const tsk_id_t *restrict edge_parent = tables->edges.parent; const tsk_id_t *restrict edge_child = tables->edges.child; tsk_id_t j, e; tsk_tree_position_t tree_pos; bool valid; valid = tsk_tree_position_next(&self->tree_pos); tree_pos = self->tree_pos; if (valid) { for (j = tree_pos.out.start; j != tree_pos.out.stop; j++) { e = tree_pos.out.order[j]; 
tsk_tree_remove_edge(self, edge_parent[e], edge_child[e], e); } for (j = tree_pos.in.start; j != tree_pos.in.stop; j++) { e = tree_pos.in.order[j]; tsk_tree_insert_edge(self, edge_parent[e], edge_child[e], e); } ret = TSK_TREE_OK; tsk_tree_update_index_and_interval(self); } else { ret = tsk_tree_clear(self); } return ret; } int TSK_WARN_UNUSED tsk_tree_prev(tsk_tree_t *self) { int ret = 0; tsk_table_collection_t *tables = self->tree_sequence->tables; const tsk_id_t *restrict edge_parent = tables->edges.parent; const tsk_id_t *restrict edge_child = tables->edges.child; tsk_id_t j, e; tsk_tree_position_t tree_pos; bool valid; valid = tsk_tree_position_prev(&self->tree_pos); tree_pos = self->tree_pos; if (valid) { for (j = tree_pos.out.start; j != tree_pos.out.stop; j--) { e = tree_pos.out.order[j]; tsk_tree_remove_edge(self, edge_parent[e], edge_child[e], e); } for (j = tree_pos.in.start; j != tree_pos.in.stop; j--) { e = tree_pos.in.order[j]; tsk_tree_insert_edge(self, edge_parent[e], edge_child[e], e); } ret = TSK_TREE_OK; tsk_tree_update_index_and_interval(self); } else { ret = tsk_tree_clear(self); } return ret; } static inline bool tsk_tree_position_in_interval(const tsk_tree_t *self, double x) { return self->interval.left <= x && x < self->interval.right; } static int tsk_tree_seek_from_null(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_table_collection_t *tables = self->tree_sequence->tables; const tsk_id_t *restrict edge_parent = tables->edges.parent; const tsk_id_t *restrict edge_child = tables->edges.child; const double *restrict edge_left = tables->edges.left; const double *restrict edge_right = tables->edges.right; double interval_left, interval_right; const double *restrict breakpoints = self->tree_sequence->breakpoints; const tsk_size_t num_trees = self->tree_sequence->num_trees; const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); tsk_id_t j, e, index; tsk_tree_position_t tree_pos; index = (tsk_id_t) 
tsk_search_sorted(breakpoints, num_trees + 1, x); if (breakpoints[index] > x) { index--; } if (x <= L / 2.0) { ret = tsk_tree_position_seek_forward(&self->tree_pos, index); if (ret != 0) { goto out; } // Since we are seeking from null, there are no edges to remove tree_pos = self->tree_pos; interval_left = tree_pos.interval.left; for (j = tree_pos.in.start; j != tree_pos.in.stop; j++) { e = tree_pos.in.order[j]; if (edge_left[e] <= interval_left && interval_left < edge_right[e]) { tsk_tree_insert_edge(self, edge_parent[e], edge_child[e], e); } } } else { ret = tsk_tree_position_seek_backward(&self->tree_pos, index); if (ret != 0) { goto out; } tree_pos = self->tree_pos; interval_right = tree_pos.interval.right; for (j = tree_pos.in.start; j != tree_pos.in.stop; j--) { e = tree_pos.in.order[j]; if (edge_right[e] >= interval_right && interval_right > edge_left[e]) { tsk_tree_insert_edge(self, edge_parent[e], edge_child[e], e); } } } tsk_tree_update_index_and_interval(self); out: return ret; } static int TSK_WARN_UNUSED tsk_tree_seek_forward(tsk_tree_t *self, tsk_id_t index) { int ret = 0; tsk_table_collection_t *tables = self->tree_sequence->tables; const tsk_id_t *restrict edge_parent = tables->edges.parent; const tsk_id_t *restrict edge_child = tables->edges.child; const double *restrict edge_left = tables->edges.left; const double *restrict edge_right = tables->edges.right; double interval_left, e_left; const double old_right = self->interval.right; tsk_id_t j, e; tsk_tree_position_t tree_pos; ret = tsk_tree_position_seek_forward(&self->tree_pos, index); if (ret != 0) { goto out; } tree_pos = self->tree_pos; interval_left = tree_pos.interval.left; for (j = tree_pos.out.start; j != tree_pos.out.stop; j++) { e = tree_pos.out.order[j]; e_left = edge_left[e]; if (e_left < old_right) { tsk_bug_assert(edge_parent[e] != TSK_NULL); tsk_tree_remove_edge(self, edge_parent[e], edge_child[e], e); } tsk_bug_assert(e_left < interval_left); } for (j = tree_pos.in.start; j != 
tree_pos.in.stop; j++) { e = tree_pos.in.order[j]; if (edge_left[e] <= interval_left && interval_left < edge_right[e]) { tsk_tree_insert_edge(self, edge_parent[e], edge_child[e], e); } } tsk_tree_update_index_and_interval(self); out: return ret; } static int TSK_WARN_UNUSED tsk_tree_seek_backward(tsk_tree_t *self, tsk_id_t index) { int ret = 0; tsk_table_collection_t *tables = self->tree_sequence->tables; const tsk_id_t *restrict edge_parent = tables->edges.parent; const tsk_id_t *restrict edge_child = tables->edges.child; const double *restrict edge_left = tables->edges.left; const double *restrict edge_right = tables->edges.right; double interval_right, e_right; const double old_right = self->interval.right; tsk_id_t j, e; tsk_tree_position_t tree_pos; ret = tsk_tree_position_seek_backward(&self->tree_pos, index); if (ret != 0) { goto out; } tree_pos = self->tree_pos; interval_right = tree_pos.interval.right; for (j = tree_pos.out.start; j != tree_pos.out.stop; j--) { e = tree_pos.out.order[j]; e_right = edge_right[e]; if (e_right >= old_right) { tsk_bug_assert(edge_parent[e] != TSK_NULL); tsk_tree_remove_edge(self, edge_parent[e], edge_child[e], e); } tsk_bug_assert(e_right > interval_right); } for (j = tree_pos.in.start; j != tree_pos.in.stop; j--) { e = tree_pos.in.order[j]; if (edge_right[e] >= interval_right && interval_right > edge_left[e]) { tsk_tree_insert_edge(self, edge_parent[e], edge_child[e], e); } } tsk_tree_update_index_and_interval(self); out: return ret; } int TSK_WARN_UNUSED tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options) { int ret = 0; double x; if (tree < 0 || tree >= (tsk_id_t) self->tree_sequence->num_trees) { ret = tsk_trace_error(TSK_ERR_SEEK_OUT_OF_BOUNDS); goto out; } x = self->tree_sequence->breakpoints[tree]; ret = tsk_tree_seek(self, x, options); out: return ret; } static int TSK_WARN_UNUSED tsk_tree_seek_linear(tsk_tree_t *self, double x) { const double L = 
tsk_treeseq_get_sequence_length(self->tree_sequence); const double t_l = self->interval.left; const double t_r = self->interval.right; int ret = 0; double distance_left, distance_right; if (x < t_l) { /* |-----|-----|========|---------| */ /* 0 x t_l t_r L */ distance_left = t_l - x; distance_right = L - t_r + x; } else { /* |------|========|------|-------| */ /* 0 t_l t_r x L */ distance_right = x - t_r; distance_left = t_l + L - x; } if (distance_right <= distance_left) { while (!tsk_tree_position_in_interval(self, x)) { ret = tsk_tree_next(self); if (ret < 0) { goto out; } } } else { while (!tsk_tree_position_in_interval(self, x)) { ret = tsk_tree_prev(self); if (ret < 0) { goto out; } } } ret = 0; out: return ret; } static int TSK_WARN_UNUSED tsk_tree_seek_skip(tsk_tree_t *self, double x) { const double t_l = self->interval.left; int ret = 0; tsk_id_t index; const tsk_size_t num_trees = self->tree_sequence->num_trees; const double *restrict breakpoints = self->tree_sequence->breakpoints; index = (tsk_id_t) tsk_search_sorted(breakpoints, num_trees + 1, x); if (breakpoints[index] > x) { index--; } if (x < t_l) { ret = tsk_tree_seek_backward(self, index); } else { ret = tsk_tree_seek_forward(self, index); } tsk_bug_assert(tsk_tree_position_in_interval(self, x)); return ret; } int TSK_WARN_UNUSED tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t options) { int ret = 0; const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); if (x < 0 || x >= L) { ret = tsk_trace_error(TSK_ERR_SEEK_OUT_OF_BOUNDS); goto out; } if (self->index == -1) { ret = tsk_tree_seek_from_null(self, x, options); } else { if (options & TSK_SEEK_SKIP) { ret = tsk_tree_seek_skip(self, x); } else { ret = tsk_tree_seek_linear(self, x); } } out: return ret; } int TSK_WARN_UNUSED tsk_tree_clear(tsk_tree_t *self) { int ret = 0; tsk_size_t j; tsk_id_t u; const tsk_size_t N = self->num_nodes + 1; const tsk_size_t num_samples = self->tree_sequence->num_samples; const bool sample_counts = 
!(self->options & TSK_NO_SAMPLE_COUNTS); const bool sample_lists = !!(self->options & TSK_SAMPLE_LISTS); const tsk_flags_t *flags = self->tree_sequence->tables->nodes.flags; self->interval.left = 0; self->interval.right = 0; self->num_edges = 0; self->index = -1; tsk_tree_position_set_null(&self->tree_pos); /* TODO we should profile this method to see if just doing a single loop over * the nodes would be more efficient than multiple memsets. */ tsk_memset(self->parent, 0xff, N * sizeof(*self->parent)); tsk_memset(self->left_child, 0xff, N * sizeof(*self->left_child)); tsk_memset(self->right_child, 0xff, N * sizeof(*self->right_child)); tsk_memset(self->left_sib, 0xff, N * sizeof(*self->left_sib)); tsk_memset(self->right_sib, 0xff, N * sizeof(*self->right_sib)); tsk_memset(self->num_children, 0, N * sizeof(*self->num_children)); tsk_memset(self->edge, 0xff, N * sizeof(*self->edge)); if (sample_counts) { tsk_memset(self->num_samples, 0, N * sizeof(*self->num_samples)); /* We can't reset the tracked samples via memset because we don't * know where the tracked samples are. 
*/ for (j = 0; j < self->num_nodes; j++) { if (!(flags[j] & TSK_NODE_IS_SAMPLE)) { self->num_tracked_samples[j] = 0; } } /* The total tracked_samples gets set in set_tracked_samples */ self->num_samples[self->virtual_root] = num_samples; } if (sample_lists) { tsk_memset(self->left_sample, 0xff, N * sizeof(tsk_id_t)); tsk_memset(self->right_sample, 0xff, N * sizeof(tsk_id_t)); tsk_memset(self->next_sample, 0xff, num_samples * sizeof(tsk_id_t)); } /* Set the sample attributes */ for (j = 0; j < num_samples; j++) { u = self->samples[j]; if (sample_counts) { self->num_samples[u] = 1; } if (sample_lists) { /* We are mapping to *indexes* into the list of samples here */ self->left_sample[u] = (tsk_id_t) j; self->right_sample[u] = (tsk_id_t) j; } } if (sample_counts && self->root_threshold == 1 && num_samples > 0) { for (j = 0; j < num_samples; j++) { /* Set initial roots */ if (self->root_threshold == 1) { tsk_tree_insert_root(self, self->samples[j], self->parent); } } } return ret; } tsk_size_t tsk_tree_get_size_bound(const tsk_tree_t *self) { tsk_size_t bound = 0; if (self->tree_sequence != NULL) { /* This is a safe upper bound which can be computed cheaply. * We have at most n roots and each edge adds at most one new * node to the tree. We also allow space for the virtual root, * to simplify client code. * * In the common case of a binary tree with a single root, we have * 2n - 1 nodes in total, and 2n - 2 edges. Therefore, we return * 3n - 1, which is an over-estimate of 1/2 and we allocate * 1.5 times as much memory as we need. * * Since tracking the exact number of nodes in the tree would require * storing the number of nodes beneath every node and complicate * the tree transition method, this seems like a good compromise * and will result in less memory usage overall in nearly all cases. 
*/ bound = 1 + self->tree_sequence->num_samples + self->num_edges; } return bound; } /* Traversal orders */ static tsk_id_t * tsk_tree_alloc_node_stack(const tsk_tree_t *self) { return tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(tsk_id_t)); } int tsk_tree_preorder(const tsk_tree_t *self, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) { return tsk_tree_preorder_from(self, -1, nodes, num_nodes_ret); } int tsk_tree_preorder_from( const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) { int ret = 0; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; tsk_id_t *stack = tsk_tree_alloc_node_stack(self); tsk_size_t num_nodes = 0; tsk_id_t u, v; int stack_top; if (stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if ((root == -1 || root == self->virtual_root) && !tsk_tree_has_sample_counts(self)) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_OPERATION); goto out; } if (root == -1) { stack_top = -1; for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { stack_top++; stack[stack_top] = u; } } else { ret = tsk_tree_check_node(self, root); if (ret != 0) { goto out; } stack_top = 0; stack[stack_top] = root; } while (stack_top >= 0) { u = stack[stack_top]; stack_top--; nodes[num_nodes] = u; num_nodes++; for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { stack_top++; stack[stack_top] = v; } } *num_nodes_ret = num_nodes; out: tsk_safe_free(stack); return ret; } /* We could implement this using the preorder function, but since it's * going to be performance critical we want to avoid the overhead * of mallocing the intermediate node list (which will be bigger than * the number of samples). 
*/ int tsk_tree_preorder_samples_from( const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) { int ret = 0; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; tsk_id_t *stack = tsk_tree_alloc_node_stack(self); tsk_size_t num_nodes = 0; tsk_id_t u, v; int stack_top; if (stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } /* We could push the virtual_root onto the stack directly to simplify * the code a little, but then we'd have to check put an extra check * when looking up the flags array (which isn't defined for virtual_root). */ if (root == -1 || root == self->virtual_root) { if (!tsk_tree_has_sample_counts(self)) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_OPERATION); goto out; } stack_top = -1; for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { stack_top++; stack[stack_top] = u; } } else { ret = tsk_tree_check_node(self, root); if (ret != 0) { goto out; } stack_top = 0; stack[stack_top] = root; } while (stack_top >= 0) { u = stack[stack_top]; stack_top--; if (flags[u] & TSK_NODE_IS_SAMPLE) { nodes[num_nodes] = u; num_nodes++; } for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { stack_top++; stack[stack_top] = v; } } *num_nodes_ret = num_nodes; out: tsk_safe_free(stack); return ret; } int tsk_tree_postorder(const tsk_tree_t *self, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) { return tsk_tree_postorder_from(self, -1, nodes, num_nodes_ret); } int tsk_tree_postorder_from( const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) { int ret = 0; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; const tsk_id_t *restrict parent = self->parent; tsk_id_t *stack = tsk_tree_alloc_node_stack(self); tsk_size_t num_nodes = 0; tsk_id_t u, v, postorder_parent; int stack_top; bool 
is_virtual_root = root == self->virtual_root; if (stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (root == -1 || is_virtual_root) { if (!tsk_tree_has_sample_counts(self)) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_OPERATION); goto out; } stack_top = -1; for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { stack_top++; stack[stack_top] = u; } } else { ret = tsk_tree_check_node(self, root); if (ret != 0) { goto out; } stack_top = 0; stack[stack_top] = root; } postorder_parent = TSK_NULL; while (stack_top >= 0) { u = stack[stack_top]; if (right_child[u] != TSK_NULL && u != postorder_parent) { for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { stack_top++; stack[stack_top] = v; } } else { stack_top--; postorder_parent = parent[u]; nodes[num_nodes] = u; num_nodes++; } } if (is_virtual_root) { nodes[num_nodes] = root; num_nodes++; } *num_nodes_ret = num_nodes; out: tsk_safe_free(stack); return ret; } /* Balance/imbalance metrics */ /* Result is a tsk_size_t value here because we could imagine the total * depth overflowing a 32bit integer for a large tree. */ int tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result) { /* Keep the size of the stack elements to 8 bytes in total in the * standard case. A tsk_id_t depth value is always safe, since * depth counts the number of nodes encountered on a path. 
*/ struct stack_elem { tsk_id_t node; tsk_id_t depth; }; int ret = 0; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; struct stack_elem *stack = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*stack)); int stack_top; tsk_size_t total_depth; tsk_id_t u; struct stack_elem s = { .node = TSK_NULL, .depth = 0 }; if (stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } stack_top = -1; for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { stack_top++; s.node = u; stack[stack_top] = s; } total_depth = 0; while (stack_top >= 0) { s = stack[stack_top]; stack_top--; u = right_child[s.node]; if (u == TSK_NULL) { total_depth += (tsk_size_t) s.depth; } else { s.depth++; while (u != TSK_NULL) { stack_top++; s.node = u; stack[stack_top] = s; u = left_sib[u]; } } } *result = total_depth; out: tsk_safe_free(stack); return ret; } int tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result) { int ret = 0; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); tsk_id_t *num_leaves = tsk_calloc(self->num_nodes, sizeof(*num_leaves)); tsk_size_t j, num_nodes, total; tsk_id_t num_children, u, v; if (nodes == NULL || num_leaves == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (tsk_tree_get_num_roots(self) != 1) { ret = tsk_trace_error(TSK_ERR_UNDEFINED_MULTIROOT); goto out; } ret = tsk_tree_postorder(self, nodes, &num_nodes); if (ret != 0) { goto out; } total = 0; for (j = 0; j < num_nodes; j++) { u = nodes[j]; /* Cheaper to compute this on the fly than to access the num_children array. * since we're already iterating over the children. 
*/ num_children = 0; for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { num_children++; num_leaves[u] += num_leaves[v]; } if (num_children == 0) { num_leaves[u] = 1; } else if (num_children == 2) { v = right_child[u]; total += (tsk_size_t) llabs(num_leaves[v] - num_leaves[left_sib[v]]); } else { ret = tsk_trace_error(TSK_ERR_UNDEFINED_NONBINARY); goto out; } } *result = total; out: tsk_safe_free(nodes); tsk_safe_free(num_leaves); return ret; } int tsk_tree_b1_index(const tsk_tree_t *self, double *result) { int ret = 0; const tsk_id_t *restrict parent = self->parent; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); tsk_size_t *max_path_length = tsk_calloc(self->num_nodes, sizeof(*max_path_length)); tsk_size_t j, num_nodes, mpl; double total = 0.0; tsk_id_t u, v; if (nodes == NULL || max_path_length == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = tsk_tree_postorder(self, nodes, &num_nodes); if (ret != 0) { goto out; } for (j = 0; j < num_nodes; j++) { u = nodes[j]; if (parent[u] != TSK_NULL && right_child[u] != TSK_NULL) { mpl = 0; for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { mpl = TSK_MAX(mpl, max_path_length[v]); } max_path_length[u] = mpl + 1; total += 1 / (double) max_path_length[u]; } } *result = total; out: tsk_safe_free(nodes); tsk_safe_free(max_path_length); return ret; } static double general_log(double x, double base) { return log(x) / log(base); } int tsk_tree_b2_index(const tsk_tree_t *self, double base, double *result) { struct stack_elem { tsk_id_t node; double path_product; }; int ret = 0; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; struct stack_elem *stack = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*stack)); int stack_top; double total_proba = 0; double num_children; tsk_id_t u; struct stack_elem 
s = { .node = TSK_NULL, .path_product = 1 }; if (stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (tsk_tree_get_num_roots(self) != 1) { ret = tsk_trace_error(TSK_ERR_UNDEFINED_MULTIROOT); goto out; } stack_top = 0; s.node = tsk_tree_get_left_root(self); stack[stack_top] = s; while (stack_top >= 0) { s = stack[stack_top]; stack_top--; u = right_child[s.node]; if (u == TSK_NULL) { total_proba -= s.path_product * general_log(s.path_product, base); } else { num_children = 0; for (; u != TSK_NULL; u = left_sib[u]) { num_children++; } s.path_product *= 1 / num_children; for (u = right_child[s.node]; u != TSK_NULL; u = left_sib[u]) { stack_top++; s.node = u; stack[stack_top] = s; } } } *result = total_proba; out: tsk_safe_free(stack); return ret; } int tsk_tree_num_lineages(const tsk_tree_t *self, double t, tsk_size_t *result) { int ret = 0; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; const double *restrict time = self->tree_sequence->tables->nodes.time; tsk_id_t *stack = tsk_tree_alloc_node_stack(self); tsk_size_t num_lineages = 0; int stack_top; tsk_id_t u, v; double child_time, parent_time; if (stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (!tsk_isfinite(t)) { ret = tsk_trace_error(TSK_ERR_TIME_NONFINITE); goto out; } /* Push the roots onto the stack */ stack_top = -1; for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { stack_top++; stack[stack_top] = u; } while (stack_top >= 0) { u = stack[stack_top]; parent_time = time[u]; stack_top--; for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { child_time = time[v]; /* Only traverse down the tree as far as we need to */ if (child_time > t) { stack_top++; stack[stack_top] = v; } else if (t < parent_time) { num_lineages++; } } } *result = num_lineages; out: tsk_safe_free(stack); return ret; } /* Parsimony methods */ static inline uint64_t set_bit(uint64_t value, int32_t bit) { 
return value | (1ULL << bit); } static inline bool bit_is_set(uint64_t value, int32_t bit) { return (value & (1ULL << bit)) != 0; } static inline int8_t get_smallest_set_bit(uint64_t v) { /* This is an inefficient implementation, there are several better * approaches. On GCC we can use * return (uint8_t) (__builtin_ffsll((long long) v) - 1); */ uint64_t t = 1; int8_t r = 0; assert(v != 0); while ((v & t) == 0) { t <<= 1; r++; } return r; } #define HARTIGAN_MAX_ALLELES 64 /* This interface is experimental. In the future, we should provide the option to * use a general cost matrix, in which case we'll use the Sankoff algorithm. For * now this is unused. * * We should also vectorise the function so that several sites can be processed * at once. * * The algorithm used here is Hartigan parsimony, "Minimum Mutation Fits to a * Given Tree", Biometrics 1973. */ int TSK_WARN_UNUSED tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, double *TSK_UNUSED(cost_matrix), tsk_flags_t options, int32_t *r_ancestral_state, tsk_size_t *r_num_transitions, tsk_state_transition_t **r_transitions) { int ret = 0; struct stack_elem { tsk_id_t node; tsk_id_t transition_parent; int32_t state; }; const tsk_size_t num_samples = self->tree_sequence->num_samples; const tsk_id_t *restrict left_child = self->left_child; const tsk_id_t *restrict right_sib = self->right_sib; const tsk_size_t N = tsk_treeseq_get_num_nodes(self->tree_sequence); const tsk_flags_t *restrict node_flags = self->tree_sequence->tables->nodes.flags; tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); /* Note: to use less memory here and to improve cache performance we should * probably change to allocating exactly the number of nodes returned by * a preorder traversal, and then lay the memory out in this order. So, we'd * need a map from node ID to its index in the preorder traversal, but this * is trivial to compute. 
Probably doesn't matter so much at the moment * when we're doing a single site, but it would make a big difference if * we were vectorising over lots of sites. */ uint64_t *restrict optimal_set = tsk_calloc(N + 1, sizeof(*optimal_set)); struct stack_elem *restrict preorder_stack = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*preorder_stack)); tsk_id_t u, v; /* The largest possible number of transitions is one over every sample */ tsk_state_transition_t *transitions = tsk_malloc(num_samples * sizeof(*transitions)); int32_t allele, ancestral_state; int stack_top; struct stack_elem s; tsk_size_t j, num_transitions, max_allele_count, num_nodes; tsk_size_t allele_count[HARTIGAN_MAX_ALLELES]; tsk_size_t non_missing = 0; int32_t num_alleles = 0; if (optimal_set == NULL || preorder_stack == NULL || transitions == NULL || nodes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < num_samples; j++) { if (genotypes[j] >= HARTIGAN_MAX_ALLELES || genotypes[j] < TSK_MISSING_DATA) { ret = tsk_trace_error(TSK_ERR_BAD_GENOTYPE); goto out; } u = self->tree_sequence->samples[j]; if (genotypes[j] == TSK_MISSING_DATA) { /* All bits set */ optimal_set[u] = UINT64_MAX; } else { optimal_set[u] = set_bit(optimal_set[u], genotypes[j]); num_alleles = TSK_MAX(genotypes[j], num_alleles); non_missing++; } } if (non_missing == 0) { ret = tsk_trace_error(TSK_ERR_GENOTYPES_ALL_MISSING); goto out; } num_alleles++; ancestral_state = 0; /* keep compiler happy */ if (options & TSK_MM_FIXED_ANCESTRAL_STATE) { ancestral_state = *r_ancestral_state; if ((ancestral_state < 0) || (ancestral_state >= HARTIGAN_MAX_ALLELES)) { ret = tsk_trace_error(TSK_ERR_BAD_ANCESTRAL_STATE); goto out; } else if (ancestral_state >= num_alleles) { num_alleles = (int32_t) (ancestral_state + 1); } } ret = tsk_tree_postorder_from(self, self->virtual_root, nodes, &num_nodes); if (ret != 0) { goto out; } for (j = 0; j < num_nodes; j++) { u = nodes[j]; tsk_memset(allele_count, 0, ((size_t) 
num_alleles) * sizeof(*allele_count)); for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { for (allele = 0; allele < num_alleles; allele++) { allele_count[allele] += bit_is_set(optimal_set[v], allele); } } /* the virtual root has no flags defined */ if (u == (tsk_id_t) N || !(node_flags[u] & TSK_NODE_IS_SAMPLE)) { max_allele_count = 0; for (allele = 0; allele < num_alleles; allele++) { max_allele_count = TSK_MAX(max_allele_count, allele_count[allele]); } for (allele = 0; allele < num_alleles; allele++) { if (allele_count[allele] == max_allele_count) { optimal_set[u] = set_bit(optimal_set[u], allele); } } } } if (!(options & TSK_MM_FIXED_ANCESTRAL_STATE)) { ancestral_state = get_smallest_set_bit(optimal_set[self->virtual_root]); } else { optimal_set[self->virtual_root] = UINT64_MAX; } num_transitions = 0; /* Do a preorder traversal */ preorder_stack[0].node = self->virtual_root; preorder_stack[0].state = ancestral_state; preorder_stack[0].transition_parent = TSK_NULL; stack_top = 0; while (stack_top >= 0) { s = preorder_stack[stack_top]; stack_top--; if (!bit_is_set(optimal_set[s.node], s.state)) { s.state = get_smallest_set_bit(optimal_set[s.node]); transitions[num_transitions].node = s.node; transitions[num_transitions].parent = s.transition_parent; transitions[num_transitions].state = s.state; s.transition_parent = (tsk_id_t) num_transitions; num_transitions++; } for (v = left_child[s.node]; v != TSK_NULL; v = right_sib[v]) { stack_top++; s.node = v; preorder_stack[stack_top] = s; } } *r_transitions = transitions; *r_num_transitions = num_transitions; *r_ancestral_state = ancestral_state; transitions = NULL; out: tsk_safe_free(transitions); /* Cannot safe_free because of 'restrict' */ if (optimal_set != NULL) { free(optimal_set); } if (preorder_stack != NULL) { free(preorder_stack); } if (nodes != NULL) { free(nodes); } return ret; } /* ======================================================== * * KC Distance * 
======================================================== */ typedef struct { tsk_size_t *m; double *M; tsk_id_t n; tsk_id_t N; } kc_vectors; static int kc_vectors_alloc(kc_vectors *self, tsk_id_t n) { int ret = 0; self->n = n; self->N = (n * (n - 1)) / 2; self->m = tsk_calloc((size_t) (self->N + self->n), sizeof(*self->m)); self->M = tsk_calloc((size_t) (self->N + self->n), sizeof(*self->M)); if (self->m == NULL || self->M == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } out: return ret; } static void kc_vectors_free(kc_vectors *self) { tsk_safe_free(self->m); tsk_safe_free(self->M); } static inline void update_kc_vectors_single_sample( const tsk_treeseq_t *ts, kc_vectors *kc_vecs, tsk_id_t u, double time) { const tsk_id_t *sample_index_map = ts->sample_index_map; tsk_id_t u_index = sample_index_map[u]; kc_vecs->m[kc_vecs->N + u_index] = 1; kc_vecs->M[kc_vecs->N + u_index] = time; } static inline void update_kc_vectors_all_pairs(const tsk_tree_t *tree, kc_vectors *kc_vecs, tsk_id_t u, tsk_id_t v, tsk_size_t depth, double time) { tsk_id_t sample1_index, sample2_index, n1, n2, tmp, pair_index; const tsk_id_t *restrict left_sample = tree->left_sample; const tsk_id_t *restrict right_sample = tree->right_sample; const tsk_id_t *restrict next_sample = tree->next_sample; tsk_size_t *restrict kc_m = kc_vecs->m; double *restrict kc_M = kc_vecs->M; sample1_index = left_sample[u]; while (sample1_index != TSK_NULL) { sample2_index = left_sample[v]; while (sample2_index != TSK_NULL) { n1 = sample1_index; n2 = sample2_index; if (n1 > n2) { tmp = n1; n1 = n2; n2 = tmp; } /* We spend ~40% of our time here because these accesses * are not in order and gets very poor cache behavior */ pair_index = n2 - n1 - 1 + (-1 * n1 * (n1 - 2 * kc_vecs->n + 1)) / 2; kc_m[pair_index] = depth; kc_M[pair_index] = time; if (sample2_index == right_sample[v]) { break; } sample2_index = next_sample[sample2_index]; } if (sample1_index == right_sample[u]) { break; } sample1_index = 
next_sample[sample1_index]; } } struct kc_stack_elmt { tsk_id_t node; tsk_size_t depth; }; static int fill_kc_vectors(const tsk_tree_t *t, kc_vectors *kc_vecs) { int stack_top; tsk_size_t depth; double time; const double *times; struct kc_stack_elmt *stack; tsk_id_t root, u, c1, c2; int ret = 0; const tsk_treeseq_t *ts = t->tree_sequence; stack = tsk_malloc(tsk_tree_get_size_bound(t) * sizeof(*stack)); if (stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } times = t->tree_sequence->tables->nodes.time; for (root = tsk_tree_get_left_root(t); root != TSK_NULL; root = t->right_sib[root]) { stack_top = 0; stack[stack_top].node = root; stack[stack_top].depth = 0; while (stack_top >= 0) { u = stack[stack_top].node; depth = stack[stack_top].depth; stack_top--; if (tsk_tree_is_sample(t, u)) { time = tsk_tree_get_branch_length_unsafe(t, u); update_kc_vectors_single_sample(ts, kc_vecs, u, time); } /* Don't bother going deeper if there are no samples under this node */ if (t->left_sample[u] != TSK_NULL) { for (c1 = t->left_child[u]; c1 != TSK_NULL; c1 = t->right_sib[c1]) { stack_top++; stack[stack_top].node = c1; stack[stack_top].depth = depth + 1; for (c2 = t->right_sib[c1]; c2 != TSK_NULL; c2 = t->right_sib[c2]) { time = times[root] - times[u]; update_kc_vectors_all_pairs(t, kc_vecs, c1, c2, depth, time); } } } } } out: tsk_safe_free(stack); return ret; } static double norm_kc_vectors(kc_vectors *self, kc_vectors *other, double lambda) { double vT1, vT2, distance_sum; tsk_id_t i; distance_sum = 0; for (i = 0; i < self->n + self->N; i++) { vT1 = ((double) self->m[i] * (1 - lambda)) + (lambda * self->M[i]); vT2 = ((double) other->m[i] * (1 - lambda)) + (lambda * other->M[i]); distance_sum += (vT1 - vT2) * (vT1 - vT2); } return sqrt(distance_sum); } static int check_kc_distance_tree_inputs(const tsk_tree_t *self) { tsk_id_t u, num_nodes, left_child; int ret = 0; if (tsk_tree_get_num_roots(self) != 1) { ret = tsk_trace_error(TSK_ERR_MULTIPLE_ROOTS); goto out; 
} if (!tsk_tree_has_sample_lists(self)) { ret = tsk_trace_error(TSK_ERR_NO_SAMPLE_LISTS); goto out; } num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(self->tree_sequence); for (u = 0; u < num_nodes; u++) { left_child = self->left_child[u]; if (left_child != TSK_NULL && left_child == self->right_child[u]) { ret = tsk_trace_error(TSK_ERR_UNARY_NODES); goto out; } } out: return ret; } static int check_kc_distance_samples_inputs(const tsk_treeseq_t *self, const tsk_treeseq_t *other) { const tsk_id_t *samples, *other_samples; tsk_id_t i, n; int ret = 0; if (self->num_samples != other->num_samples) { ret = tsk_trace_error(TSK_ERR_SAMPLE_SIZE_MISMATCH); goto out; } samples = self->samples; other_samples = other->samples; n = (tsk_id_t) self->num_samples; for (i = 0; i < n; i++) { if (samples[i] != other_samples[i]) { ret = tsk_trace_error(TSK_ERR_SAMPLES_NOT_EQUAL); goto out; } } out: return ret; } int tsk_tree_kc_distance( const tsk_tree_t *self, const tsk_tree_t *other, double lambda, double *result) { tsk_id_t n, i; kc_vectors vecs[2]; const tsk_tree_t *trees[2] = { self, other }; int ret = 0; for (i = 0; i < 2; i++) { tsk_memset(&vecs[i], 0, sizeof(kc_vectors)); } ret = check_kc_distance_samples_inputs(self->tree_sequence, other->tree_sequence); if (ret != 0) { goto out; } for (i = 0; i < 2; i++) { ret = check_kc_distance_tree_inputs(trees[i]); if (ret != 0) { goto out; } } n = (tsk_id_t) self->tree_sequence->num_samples; for (i = 0; i < 2; i++) { ret = kc_vectors_alloc(&vecs[i], n); if (ret != 0) { goto out; } ret = fill_kc_vectors(trees[i], &vecs[i]); if (ret != 0) { goto out; } } *result = norm_kc_vectors(&vecs[0], &vecs[1], lambda); out: for (i = 0; i < 2; i++) { kc_vectors_free(&vecs[i]); } return ret; } static int check_kc_distance_tree_sequence_inputs( const tsk_treeseq_t *self, const tsk_treeseq_t *other) { int ret = 0; if (self->tables->sequence_length != other->tables->sequence_length) { ret = tsk_trace_error(TSK_ERR_SEQUENCE_LENGTH_MISMATCH); goto out; } 
ret = check_kc_distance_samples_inputs(self, other); if (ret != 0) { goto out; } out: return ret; } static void update_kc_pair_with_sample(const tsk_tree_t *self, kc_vectors *kc, tsk_id_t sample, tsk_size_t *depths, double root_time) { tsk_id_t c, p, sib; double time; tsk_size_t depth; double *times = self->tree_sequence->tables->nodes.time; c = sample; for (p = self->parent[sample]; p != TSK_NULL; p = self->parent[p]) { time = root_time - times[p]; depth = depths[p]; for (sib = self->left_child[p]; sib != TSK_NULL; sib = self->right_sib[sib]) { if (sib != c) { update_kc_vectors_all_pairs(self, kc, sample, sib, depth, time); } } c = p; } } static int update_kc_subtree_state( tsk_tree_t *t, kc_vectors *kc, tsk_id_t u, tsk_size_t *depths, double root_time) { int stack_top; tsk_id_t v, c; tsk_id_t *stack = NULL; int ret = 0; stack = tsk_malloc(tsk_tree_get_size_bound(t) * sizeof(*stack)); if (stack == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } stack_top = 0; stack[stack_top] = u; while (stack_top >= 0) { v = stack[stack_top]; stack_top--; if (tsk_tree_is_sample(t, v)) { update_kc_pair_with_sample(t, kc, v, depths, root_time); } for (c = t->left_child[v]; c != TSK_NULL; c = t->right_sib[c]) { if (depths[c] != 0) { depths[c] = depths[v] + 1; stack_top++; stack[stack_top] = c; } } } out: tsk_safe_free(stack); return ret; } static int update_kc_incremental(tsk_tree_t *tree, kc_vectors *kc, tsk_size_t *depths) { int ret = 0; tsk_id_t u, v, e, j; double root_time, time; const double *restrict times = tree->tree_sequence->tables->nodes.time; const tsk_id_t *restrict edges_child = tree->tree_sequence->tables->edges.child; const tsk_id_t *restrict edges_parent = tree->tree_sequence->tables->edges.parent; tsk_tree_position_t tree_pos = tree->tree_pos; /* Update state of detached subtrees */ for (j = tree_pos.out.stop - 1; j >= tree_pos.out.start; j--) { e = tree_pos.out.order[j]; u = edges_child[e]; depths[u] = 0; if (tree->parent[u] == TSK_NULL) { root_time 
= times[tsk_tree_node_root(tree, u)]; ret = update_kc_subtree_state(tree, kc, u, depths, root_time); if (ret != 0) { goto out; } } } /* Propagate state change down into reattached subtrees. */ for (j = tree_pos.in.stop - 1; j >= tree_pos.in.start; j--) { e = tree_pos.in.order[j]; u = edges_child[e]; v = edges_parent[e]; tsk_bug_assert(depths[u] == 0); depths[u] = depths[v] + 1; root_time = times[tsk_tree_node_root(tree, u)]; ret = update_kc_subtree_state(tree, kc, u, depths, root_time); if (ret != 0) { goto out; } if (tsk_tree_is_sample(tree, u)) { time = tsk_tree_get_branch_length_unsafe(tree, u); update_kc_vectors_single_sample(tree->tree_sequence, kc, u, time); } } out: return ret; } int tsk_treeseq_kc_distance(const tsk_treeseq_t *self, const tsk_treeseq_t *other, double lambda_, double *result) { int i; tsk_id_t n; tsk_size_t num_nodes; double left, span, total; const tsk_treeseq_t *treeseqs[2] = { self, other }; tsk_tree_t trees[2]; kc_vectors kcs[2]; tsk_size_t *depths[2]; int ret = 0; for (i = 0; i < 2; i++) { tsk_memset(&trees[i], 0, sizeof(trees[i])); tsk_memset(&kcs[i], 0, sizeof(kcs[i])); depths[i] = NULL; } ret = check_kc_distance_tree_sequence_inputs(self, other); if (ret != 0) { goto out; } n = (tsk_id_t) self->num_samples; for (i = 0; i < 2; i++) { ret = tsk_tree_init(&trees[i], treeseqs[i], TSK_SAMPLE_LISTS); if (ret != 0) { goto out; } ret = kc_vectors_alloc(&kcs[i], n); if (ret != 0) { goto out; } num_nodes = tsk_treeseq_get_num_nodes(treeseqs[i]); depths[i] = tsk_calloc(num_nodes, sizeof(*depths[i])); if (depths[i] == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } } total = 0; left = 0; ret = tsk_tree_first(&trees[0]); if (ret != TSK_TREE_OK) { goto out; } ret = check_kc_distance_tree_inputs(&trees[0]); if (ret != 0) { goto out; } ret = update_kc_incremental(&trees[0], &kcs[0], depths[0]); if (ret != 0) { goto out; } while ((ret = tsk_tree_next(&trees[1])) == TSK_TREE_OK) { ret = check_kc_distance_tree_inputs(&trees[1]); if (ret 
!= 0) { goto out; } ret = update_kc_incremental(&trees[1], &kcs[1], depths[1]); if (ret != 0) { goto out; } while (trees[0].interval.right < trees[1].interval.right) { span = trees[0].interval.right - left; total += norm_kc_vectors(&kcs[0], &kcs[1], lambda_) * span; left = trees[0].interval.right; ret = tsk_tree_next(&trees[0]); tsk_bug_assert(ret == TSK_TREE_OK); ret = check_kc_distance_tree_inputs(&trees[0]); if (ret != 0) { goto out; } ret = update_kc_incremental(&trees[0], &kcs[0], depths[0]); if (ret != 0) { goto out; } } span = trees[1].interval.right - left; left = trees[1].interval.right; total += norm_kc_vectors(&kcs[0], &kcs[1], lambda_) * span; } if (ret != 0) { goto out; } *result = total / self->tables->sequence_length; out: for (i = 0; i < 2; i++) { tsk_tree_free(&trees[i]); kc_vectors_free(&kcs[i]); tsk_safe_free(depths[i]); } return ret; } /* * Divergence matrix */ typedef struct { /* Note it's a waste storing the triply linked tree here, but the code * is written on the assumption of 1-based trees and the algorithm is * frighteningly subtle, so it doesn't seem worth messing with it * unless we really need to save some memory */ tsk_id_t *parent; tsk_id_t *child; tsk_id_t *sib; tsk_id_t *lambda; tsk_id_t *pi; tsk_id_t *tau; tsk_id_t *beta; tsk_id_t *alpha; } sv_tables_t; static int sv_tables_init(sv_tables_t *self, tsk_size_t n) { int ret = 0; self->parent = tsk_malloc(n * sizeof(*self->parent)); self->child = tsk_malloc(n * sizeof(*self->child)); self->sib = tsk_malloc(n * sizeof(*self->sib)); self->pi = tsk_malloc(n * sizeof(*self->pi)); self->lambda = tsk_malloc(n * sizeof(*self->lambda)); self->tau = tsk_malloc(n * sizeof(*self->tau)); self->beta = tsk_malloc(n * sizeof(*self->beta)); self->alpha = tsk_malloc(n * sizeof(*self->alpha)); if (self->parent == NULL || self->child == NULL || self->sib == NULL || self->lambda == NULL || self->tau == NULL || self->beta == NULL || self->alpha == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; 
} out: return ret; } static int sv_tables_free(sv_tables_t *self) { tsk_safe_free(self->parent); tsk_safe_free(self->child); tsk_safe_free(self->sib); tsk_safe_free(self->lambda); tsk_safe_free(self->pi); tsk_safe_free(self->tau); tsk_safe_free(self->beta); tsk_safe_free(self->alpha); return 0; } static void sv_tables_reset(sv_tables_t *self, tsk_tree_t *tree) { const tsk_size_t n = 1 + tree->num_nodes; tsk_memset(self->parent, 0, n * sizeof(*self->parent)); tsk_memset(self->child, 0, n * sizeof(*self->child)); tsk_memset(self->sib, 0, n * sizeof(*self->sib)); tsk_memset(self->pi, 0, n * sizeof(*self->pi)); tsk_memset(self->lambda, 0, n * sizeof(*self->lambda)); tsk_memset(self->tau, 0, n * sizeof(*self->tau)); tsk_memset(self->beta, 0, n * sizeof(*self->beta)); tsk_memset(self->alpha, 0, n * sizeof(*self->alpha)); } static void sv_tables_convert_tree(sv_tables_t *self, tsk_tree_t *tree) { const tsk_size_t n = 1 + tree->num_nodes; const tsk_id_t *restrict tsk_parent = tree->parent; tsk_id_t *restrict child = self->child; tsk_id_t *restrict parent = self->parent; tsk_id_t *restrict sib = self->sib; tsk_size_t j; tsk_id_t u, v; for (j = 0; j < n - 1; j++) { u = (tsk_id_t) j + 1; v = tsk_parent[j] + 1; sib[u] = child[v]; child[v] = u; parent[u] = v; } } #define LAMBDA 0 static void sv_tables_build_index(sv_tables_t *self) { const tsk_id_t *restrict child = self->child; const tsk_id_t *restrict parent = self->parent; const tsk_id_t *restrict sib = self->sib; tsk_id_t *restrict lambda = self->lambda; tsk_id_t *restrict pi = self->pi; tsk_id_t *restrict tau = self->tau; tsk_id_t *restrict beta = self->beta; tsk_id_t *restrict alpha = self->alpha; tsk_id_t a, n, p, h; p = child[LAMBDA]; n = 0; lambda[0] = -1; while (p != LAMBDA) { while (true) { n++; pi[p] = n; tau[n] = LAMBDA; lambda[n] = 1 + lambda[n >> 1]; if (child[p] != LAMBDA) { p = child[p]; } else { break; } } beta[p] = n; while (true) { tau[beta[p]] = parent[p]; if (sib[p] != LAMBDA) { p = sib[p]; break; } else { 
p = parent[p]; if (p != LAMBDA) { h = lambda[n & -pi[p]]; beta[p] = ((n >> h) | 1) << h; } else { break; } } } } /* Begin the second traversal */ lambda[0] = lambda[n]; pi[LAMBDA] = 0; beta[LAMBDA] = 0; alpha[LAMBDA] = 0; p = child[LAMBDA]; while (p != LAMBDA) { while (true) { a = alpha[parent[p]] | (beta[p] & -beta[p]); alpha[p] = a; if (child[p] != LAMBDA) { p = child[p]; } else { break; } } while (true) { if (sib[p] != LAMBDA) { p = sib[p]; break; } else { p = parent[p]; if (p == LAMBDA) { break; } } } } } static void sv_tables_build(sv_tables_t *self, tsk_tree_t *tree) { sv_tables_reset(self, tree); sv_tables_convert_tree(self, tree); sv_tables_build_index(self); } static tsk_id_t sv_tables_mrca_one_based(const sv_tables_t *self, tsk_id_t x, tsk_id_t y) { const tsk_id_t *restrict lambda = self->lambda; const tsk_id_t *restrict pi = self->pi; const tsk_id_t *restrict tau = self->tau; const tsk_id_t *restrict beta = self->beta; const tsk_id_t *restrict alpha = self->alpha; tsk_id_t h, k, xhat, yhat, ell, j, z; if (beta[x] <= beta[y]) { h = lambda[beta[y] & -beta[x]]; } else { h = lambda[beta[x] & -beta[y]]; } k = alpha[x] & alpha[y] & -(1 << h); h = lambda[k & -k]; j = ((beta[x] >> h) | 1) << h; if (j == beta[x]) { xhat = x; } else { ell = lambda[alpha[x] & ((1 << h) - 1)]; xhat = tau[((beta[x] >> ell) | 1) << ell]; } if (j == beta[y]) { yhat = y; } else { ell = lambda[alpha[y] & ((1 << h) - 1)]; yhat = tau[((beta[y] >> ell) | 1) << ell]; } if (pi[xhat] <= pi[yhat]) { z = xhat; } else { z = yhat; } return z; } static tsk_id_t sv_tables_mrca(const sv_tables_t *self, tsk_id_t x, tsk_id_t y) { /* Convert to 1-based indexes and back */ return sv_tables_mrca_one_based(self, x + 1, y + 1) - 1; } static int tsk_treeseq_divergence_matrix_branch(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *restrict sample_set_sizes, const tsk_id_t *restrict sample_sets, tsk_size_t num_windows, const double *restrict windows, tsk_flags_t options, double 
*restrict result) { int ret = 0; tsk_tree_t tree; const double *restrict nodes_time = self->tables->nodes.time; const tsk_size_t N = num_sample_sets; tsk_size_t i, j, k, offset, sj, sk; tsk_id_t u, v, w, u_root, v_root; double tu, tv, d, span, left, right, span_left, span_right; double *restrict D; sv_tables_t sv; tsk_size_t *ss_offsets = tsk_malloc((num_sample_sets + 1) * sizeof(*ss_offsets)); memset(&sv, 0, sizeof(sv)); ret = tsk_tree_init(&tree, self, 0); if (ret != 0) { goto out; } ret = sv_tables_init(&sv, self->tables->nodes.num_rows + 1); if (ret != 0) { goto out; } if (ss_offsets == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) { ret = tsk_trace_error(TSK_ERR_TIME_UNCALIBRATED); goto out; } ss_offsets[0] = 0; offset = 0; for (j = 0; j < N; j++) { offset += sample_set_sizes[j]; ss_offsets[j + 1] = offset; } for (i = 0; i < num_windows; i++) { left = windows[i]; right = windows[i + 1]; D = result + i * N * N; ret = tsk_tree_seek(&tree, left, 0); if (ret != 0) { goto out; } while (tree.interval.left < right && tree.index != -1) { span_left = TSK_MAX(tree.interval.left, left); span_right = TSK_MIN(tree.interval.right, right); span = span_right - span_left; sv_tables_build(&sv, &tree); for (sj = 0; sj < N; sj++) { for (j = ss_offsets[sj]; j < ss_offsets[sj + 1]; j++) { u = sample_sets[j]; for (sk = sj; sk < N; sk++) { for (k = ss_offsets[sk]; k < ss_offsets[sk + 1]; k++) { v = sample_sets[k]; if (u == v) { /* This case contributes zero to divergence, so * short-circuit to save time. * TODO is there a better way to do this? 
*/ continue; } w = sv_tables_mrca(&sv, u, v); if (w != TSK_NULL) { u_root = w; v_root = w; } else { /* Slow path - only happens for nodes in disconnected * subtrees in a tree with multiple roots */ u_root = tsk_tree_get_node_root(&tree, u); v_root = tsk_tree_get_node_root(&tree, v); } tu = nodes_time[u_root] - nodes_time[u]; tv = nodes_time[v_root] - nodes_time[v]; d = (tu + tv) * span; D[sj * N + sk] += d; } } } } ret = tsk_tree_next(&tree); if (ret < 0) { goto out; } } } ret = 0; out: tsk_tree_free(&tree); sv_tables_free(&sv); tsk_safe_free(ss_offsets); return ret; } // FIXME see #2817 // Just including this here for now as it's the simplest option. Everything // will probably move to stats.[c,h] in the near future though, and it // can pull in ``genotypes.h`` without issues. #include static void update_site_divergence(const tsk_variant_t *var, const tsk_id_t *restrict A, const tsk_size_t *restrict offsets, const tsk_size_t num_sample_sets, double *D) { const tsk_size_t num_alleles = var->num_alleles; tsk_size_t a, b, j, k; tsk_id_t u, v; double increment; for (a = 0; a < num_alleles; a++) { for (b = a + 1; b < num_alleles; b++) { for (j = offsets[a]; j < offsets[a + 1]; j++) { for (k = offsets[b]; k < offsets[b + 1]; k++) { u = A[j]; v = A[k]; /* Only increment the upper triangle to (hopefully) improve memory * access patterns */ if (u > v) { u = A[k]; v = A[j]; } increment = 1; if (u == v) { increment = 2; } D[u * (tsk_id_t) num_sample_sets + v] += increment; } } } } } static void group_alleles(const tsk_variant_t *var, tsk_id_t *restrict A, tsk_size_t *offsets) { const tsk_size_t n = var->num_samples; const int32_t *restrict genotypes = var->genotypes; tsk_id_t a; tsk_size_t j, k; k = 0; offsets[0] = 0; for (a = 0; a < (tsk_id_t) var->num_alleles; a++) { offsets[a + 1] = offsets[a]; for (j = 0; j < n; j++) { if (genotypes[j] == a) { offsets[a + 1]++; A[k] = (tsk_id_t) j; k++; } } } } static void remap_to_sample_sets(const tsk_size_t num_samples, const tsk_id_t 
*restrict samples, const tsk_id_t *restrict sample_set_index_map, tsk_id_t *restrict A) { tsk_size_t j; tsk_id_t u; for (j = 0; j < num_samples; j++) { u = samples[A[j]]; tsk_bug_assert(u >= 0); tsk_bug_assert(sample_set_index_map[u] >= 0); A[j] = sample_set_index_map[u]; } } static int tsk_treeseq_divergence_matrix_site(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_id_t *restrict sample_set_index_map, const tsk_size_t num_samples, const tsk_id_t *restrict samples, tsk_size_t num_windows, const double *restrict windows, tsk_flags_t TSK_UNUSED(options), double *restrict result) { int ret = 0; tsk_size_t i; tsk_id_t site_id; double left, right; double *restrict D; const tsk_id_t num_sites = (tsk_id_t) self->tables->sites.num_rows; const double *restrict sites_position = self->tables->sites.position; tsk_id_t *A = tsk_malloc(num_samples * sizeof(*A)); /* Allocate the allele offsets at the first variant */ tsk_size_t max_alleles = 0; tsk_size_t *allele_offsets = NULL; tsk_variant_t variant; /* FIXME it's not clear that using TSK_ISOLATED_NOT_MISSING is * correct here */ ret = tsk_variant_init( &variant, self, samples, num_samples, NULL, TSK_ISOLATED_NOT_MISSING); if (ret != 0) { goto out; } if (A == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } site_id = 0; while (site_id < num_sites && sites_position[site_id] < windows[0]) { site_id++; } for (i = 0; i < num_windows; i++) { left = windows[i]; right = windows[i + 1]; D = result + i * num_sample_sets * num_sample_sets; if (site_id < num_sites) { tsk_bug_assert(sites_position[site_id] >= left); } while (site_id < num_sites && sites_position[site_id] < right) { ret = tsk_variant_decode(&variant, site_id, 0); if (ret != 0) { goto out; } if (variant.num_alleles > max_alleles) { /* could do some kind of doubling here, but there's no * point - just keep it simple for testing. 
*/ max_alleles = variant.num_alleles; tsk_safe_free(allele_offsets); allele_offsets = tsk_malloc((max_alleles + 1) * sizeof(*allele_offsets)); if (allele_offsets == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } } group_alleles(&variant, A, allele_offsets); remap_to_sample_sets(num_samples, samples, sample_set_index_map, A); update_site_divergence(&variant, A, allele_offsets, num_sample_sets, D); site_id++; } } ret = 0; out: tsk_variant_free(&variant); tsk_safe_free(A); tsk_safe_free(allele_offsets); return ret; } /* Return the mapping from node IDs to the index of the sample set * they belong to, or -1 of none. Error if a node is in more than one * set. */ static int get_sample_set_index_map(const tsk_treeseq_t *self, const tsk_size_t num_sample_sets, const tsk_size_t *restrict sample_set_sizes, const tsk_id_t *restrict sample_sets, tsk_size_t *ret_total_samples, tsk_id_t *restrict node_index_map) { int ret = 0; tsk_size_t i, j, k; tsk_id_t u; tsk_size_t total_samples = 0; const tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_flags_t *restrict node_flags = self->tables->nodes.flags; for (j = 0; j < num_nodes; j++) { node_index_map[j] = TSK_NULL; } i = 0; for (j = 0; j < num_sample_sets; j++) { total_samples += sample_set_sizes[j]; for (k = 0; k < sample_set_sizes[j]; k++) { u = sample_sets[i]; i++; if (u < 0 || u >= (tsk_id_t) num_nodes) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } /* Note: we require nodes to be samples because we have to think * about how to normalise by the length of genome that the node * is 'in' the tree for each window otherwise. 
*/ if (!(node_flags[u] & TSK_NODE_IS_SAMPLE)) { ret = tsk_trace_error(TSK_ERR_BAD_SAMPLES); goto out; } if (node_index_map[u] != TSK_NULL) { ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } node_index_map[u] = (tsk_id_t) j; } } *ret_total_samples = total_samples; out: return ret; } static void fill_lower_triangle_count_normalise(const tsk_size_t num_windows, const tsk_size_t n, const tsk_size_t *set_sizes, double *restrict result) { tsk_size_t i, j, k; double denom; double *restrict D; /* TODO there's probably a better striding pattern that could be used here */ for (i = 0; i < num_windows; i++) { D = result + i * n * n; for (j = 0; j < n; j++) { denom = (double) set_sizes[j] * (double) (set_sizes[j] - 1); if (denom != 0) { D[j * n + j] /= denom; } for (k = j + 1; k < n; k++) { denom = (double) set_sizes[j] * (double) set_sizes[k]; D[j * n + k] /= denom; D[k * n + j] = D[j * n + k]; } } } } int tsk_treeseq_divergence_matrix(const tsk_treeseq_t *self, tsk_size_t num_sample_sets_in, const tsk_size_t *sample_set_sizes_in, const tsk_id_t *sample_sets_in, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result) { int ret = 0; tsk_size_t N, total_samples; const tsk_size_t *sample_set_sizes; const tsk_id_t *sample_sets; tsk_size_t *tmp_sample_set_sizes = NULL; const double default_windows[] = { 0, self->tables->sequence_length }; const tsk_size_t num_nodes = self->tables->nodes.num_rows; bool stat_site = !!(options & TSK_STAT_SITE); bool stat_branch = !!(options & TSK_STAT_BRANCH); bool stat_node = !!(options & TSK_STAT_NODE); tsk_id_t *sample_set_index_map = tsk_malloc(num_nodes * sizeof(*sample_set_index_map)); tsk_size_t j; if (stat_node) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE); goto out; } /* If no mode is specified, we default to site mode */ if (!(stat_site || stat_branch)) { stat_site = true; } /* It's an error to specify more than one mode */ if (stat_site + stat_branch > 1) { ret = 
tsk_trace_error(TSK_ERR_MULTIPLE_STAT_MODES); goto out; } if (options & TSK_STAT_POLARISED) { ret = tsk_trace_error(TSK_ERR_STAT_POLARISED_UNSUPPORTED); goto out; } if (windows == NULL) { num_windows = 1; windows = default_windows; } else { ret = tsk_treeseq_check_windows(self, num_windows, windows, 0); if (ret != 0) { goto out; } } /* If sample_sets is NULL, use self->samples and ignore input * num_sample_sets */ sample_sets = sample_sets_in; N = num_sample_sets_in; if (sample_sets_in == NULL) { sample_sets = self->samples; if (sample_set_sizes_in == NULL) { N = self->num_samples; } } sample_set_sizes = sample_set_sizes_in; /* If sample_set_sizes is NULL, assume its N 1S */ if (sample_set_sizes_in == NULL) { tmp_sample_set_sizes = tsk_malloc(N * sizeof(*tmp_sample_set_sizes)); if (tmp_sample_set_sizes == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < N; j++) { tmp_sample_set_sizes[j] = 1; } sample_set_sizes = tmp_sample_set_sizes; } ret = get_sample_set_index_map( self, N, sample_set_sizes, sample_sets, &total_samples, sample_set_index_map); if (ret != 0) { goto out; } tsk_memset(result, 0, num_windows * N * N * sizeof(*result)); if (stat_branch) { ret = tsk_treeseq_divergence_matrix_branch(self, N, sample_set_sizes, sample_sets, num_windows, windows, options, result); } else { tsk_bug_assert(stat_site); ret = tsk_treeseq_divergence_matrix_site(self, N, sample_set_index_map, total_samples, sample_sets, num_windows, windows, options, result); } if (ret != 0) { goto out; } fill_lower_triangle_count_normalise(num_windows, N, sample_set_sizes, result); if (options & TSK_STAT_SPAN_NORMALISE) { span_normalise(num_windows, windows, N * N, result); } out: tsk_safe_free(sample_set_index_map); tsk_safe_free(tmp_sample_set_sizes); return ret; } /* ======================================================== * * Extend haplotypes * ======================================================== */ typedef struct _edge_list_t { tsk_id_t edge; // the 
`extended` flags records whether we have decided to extend // this entry to the current tree? int extended; struct _edge_list_t *next; } edge_list_t; static void edge_list_print(edge_list_t **head, tsk_edge_table_t *edges, FILE *out) { int n = 0; edge_list_t *px; fprintf(out, "Edge list:\n"); for (px = *head; px != NULL; px = px->next) { fprintf(out, " %d: %d (%d); ", n, (int) px->edge, px->extended); if (px->edge >= 0 && edges != NULL) { fprintf(out, "%d->%d on [%.1f, %.1f)", (int) edges->child[px->edge], (int) edges->parent[px->edge], edges->left[px->edge], edges->right[px->edge]); } else { fprintf(out, "(null)"); } fprintf(out, "\n"); n += 1; } fprintf(out, "length = %d\n", n); } static void edge_list_append_entry( edge_list_t **head, edge_list_t **tail, edge_list_t *x, tsk_id_t edge, int extended) { x->edge = edge; x->extended = extended; x->next = NULL; if (*tail == NULL) { *head = x; } else { (*tail)->next = x; } *tail = x; } static void remove_unextended(edge_list_t **head, edge_list_t **tail) { edge_list_t *px, *x; px = *head; while (px != NULL && px->extended == 0) { px = px->next; } *head = px; if (px != NULL) { px->extended = 0; x = px->next; while (x != NULL) { if (x->extended > 0) { x->extended = 0; px->next = x; px = x; } x = x->next; } px->next = NULL; } *tail = px; } static void edge_list_set_extended(edge_list_t **head, tsk_id_t edge_id) { // finds the entry with edge 'edge_id' // and sets its 'extended' flag to 1 edge_list_t *px; px = *head; tsk_bug_assert(px != NULL); while (px->edge != edge_id) { px = px->next; tsk_bug_assert(px != NULL); } tsk_bug_assert(px->edge == edge_id); px->extended = 1; } static int tsk_treeseq_slide_mutation_nodes_up( const tsk_treeseq_t *self, tsk_mutation_table_t *mutations) { int ret = 0; double t; tsk_id_t c, p, next_mut; const tsk_size_t num_nodes = self->tables->nodes.num_rows; const double *sites_position = self->tables->sites.position; const double *nodes_time = self->tables->nodes.time; tsk_tree_t tree; ret = 
tsk_tree_init(&tree, self, TSK_NO_SAMPLE_COUNTS); if (ret != 0) { goto out; } next_mut = 0; for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { while (next_mut < (tsk_id_t) mutations->num_rows && sites_position[mutations->site[next_mut]] < tree.interval.right) { t = mutations->time[next_mut]; if (tsk_is_unknown_time(t)) { ret = tsk_trace_error(TSK_ERR_DISALLOWED_UNKNOWN_MUTATION_TIME); goto out; } c = mutations->node[next_mut]; tsk_bug_assert(c < (tsk_id_t) num_nodes); p = tree.parent[c]; while (p != TSK_NULL && nodes_time[p] <= t) { c = p; p = tree.parent[c]; } tsk_bug_assert(nodes_time[c] <= t); mutations->node[next_mut] = c; next_mut++; } } if (ret != 0) { goto out; } out: tsk_tree_free(&tree); return ret; } typedef struct { const tsk_treeseq_t *ts; tsk_edge_table_t *edges; int direction; tsk_id_t *last_degree, *next_degree; tsk_id_t *last_nodes_edge, *next_nodes_edge; tsk_id_t *parent_out, *parent_in; bool *not_sample; double *near_side, *far_side; edge_list_t *edges_out_head, *edges_out_tail; edge_list_t *edges_in_head, *edges_in_tail; tsk_blkalloc_t edge_list_heap; } haplotype_extender_t; static int haplotype_extender_init(haplotype_extender_t *self, const tsk_treeseq_t *ts, int direction, tsk_edge_table_t *edges) { int ret = 0; tsk_id_t tj; tsk_size_t num_nodes = tsk_treeseq_get_num_nodes(ts); tsk_memset(self, 0, sizeof(haplotype_extender_t)); self->ts = ts; self->edges = edges; ret = tsk_edge_table_copy(&ts->tables->edges, self->edges, TSK_NO_INIT); if (ret != 0) { goto out; } self->direction = direction; if (direction == TSK_DIR_FORWARD) { self->near_side = self->edges->left; self->far_side = self->edges->right; } else { self->near_side = self->edges->right; self->far_side = self->edges->left; } self->edges_in_head = NULL; self->edges_in_tail = NULL; self->edges_out_head = NULL; self->edges_out_tail = NULL; ret = tsk_blkalloc_init(&self->edge_list_heap, 8192); if (ret != 0) { goto out; } self->last_degree = 
tsk_calloc(num_nodes, sizeof(*self->last_degree)); self->next_degree = tsk_calloc(num_nodes, sizeof(*self->next_degree)); self->last_nodes_edge = tsk_malloc(num_nodes * sizeof(*self->last_nodes_edge)); self->next_nodes_edge = tsk_malloc(num_nodes * sizeof(*self->next_nodes_edge)); self->parent_out = tsk_malloc(num_nodes * sizeof(*self->parent_out)); self->parent_in = tsk_malloc(num_nodes * sizeof(*self->parent_in)); self->not_sample = tsk_malloc(num_nodes * sizeof(*self->not_sample)); if (self->last_degree == NULL || self->next_degree == NULL || self->last_nodes_edge == NULL || self->next_nodes_edge == NULL || self->parent_out == NULL || self->parent_in == NULL || self->not_sample == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(self->last_nodes_edge, 0xff, num_nodes * sizeof(*self->last_nodes_edge)); tsk_memset(self->next_nodes_edge, 0xff, num_nodes * sizeof(*self->next_nodes_edge)); tsk_memset(self->parent_out, 0xff, num_nodes * sizeof(*self->parent_out)); tsk_memset(self->parent_in, 0xff, num_nodes * sizeof(*self->parent_in)); for (tj = 0; tj < (tsk_id_t) num_nodes; tj++) { self->not_sample[tj] = ((ts->tables->nodes.flags[tj] & TSK_NODE_IS_SAMPLE) == 0); } out: return ret; } static void haplotype_extender_print_state(haplotype_extender_t *self, FILE *out) { fprintf(out, "\n======= haplotype extender ===========\n"); fprintf(out, "parent in:\n"); for (int j = 0; j < (int) self->ts->tables->nodes.num_rows; j++) { fprintf(out, " %d: %d\n", j, (int) self->parent_in[j]); } fprintf(out, "parent out:\n"); for (int j = 0; j < (int) self->ts->tables->nodes.num_rows; j++) { fprintf(out, " %d: %d\n", j, (int) self->parent_out[j]); } fprintf(out, "last nodes edge:\n"); for (int j = 0; j < (int) self->ts->tables->nodes.num_rows; j++) { tsk_id_t ej = self->last_nodes_edge[j]; fprintf(out, " %d: %d, ", j, (int) ej); if (self->last_nodes_edge[j] != TSK_NULL) { fprintf(out, "(%d->%d, %.1f-%.1f)", (int) self->edges->child[ej], (int) 
/* Advance the extender's bookkeeping to the tree transition described by
 * ``tree_pos``.
 *
 * The work is done in two phases:
 *  1. Finish the previous transition: fold the outcome recorded in the
 *     ``extended`` flags of the edges_out / edges_in lists into the "last"
 *     state (last_nodes_edge, last_degree), clear parent_out / parent_in for
 *     the listed children, and drop list entries that were not extended.
 *  2. Set up this transition: append the edges leaving and entering the tree
 *     to the two lists and update parent_out / parent_in and the "next" state
 *     (next_nodes_edge, next_degree) accordingly.
 *
 * Entries surviving phase 1 stay on the lists, so phase 2 also walks edges
 * carried over from earlier transitions.
 *
 * Returns 0 on success, or TSK_ERR_NO_MEMORY if a list entry cannot be
 * allocated from the block allocator. */
static int
haplotype_extender_next_tree(haplotype_extender_t *self, tsk_tree_position_t *tree_pos)
{
    int ret = 0;
    tsk_id_t tj, e;
    edge_list_t *ex_out, *ex_in;
    edge_list_t *new_ex;
    const tsk_id_t *edges_child = self->edges->child;
    const tsk_id_t *edges_parent = self->edges->parent;

    /* Phase 1a: settle the edges that were leaving at the last transition */
    for (ex_out = self->edges_out_head; ex_out != NULL; ex_out = ex_out->next) {
        e = ex_out->edge;
        self->parent_out[edges_child[e]] = TSK_NULL;
        // note we only adjust near_side of edges_in, not edges_out,
        // so no need to check for zero-length edges
        if (ex_out->extended > 1) {
            // this is needed to catch newly-created edges
            // (add_or_extend_edge appends those with extended == 2)
            self->last_nodes_edge[edges_child[e]] = e;
            self->last_degree[edges_child[e]] += 1;
            self->last_degree[edges_parent[e]] += 1;
        } else if (ex_out->extended == 0) {
            // the edge really did end: remove it from the "last" state
            self->last_nodes_edge[edges_child[e]] = TSK_NULL;
            self->last_degree[edges_child[e]] -= 1;
            self->last_degree[edges_parent[e]] -= 1;
        }
        // extended == 1: an existing edge was extended, "last" state unchanged
    }
    remove_unextended(&self->edges_out_head, &self->edges_out_tail);
    /* Phase 1b: settle the edges that were arriving at the last transition */
    for (ex_in = self->edges_in_head; ex_in != NULL; ex_in = ex_in->next) {
        e = ex_in->edge;
        self->parent_in[edges_child[e]] = TSK_NULL;
        // an arriving edge enters the "last" state only if it was not
        // postponed and still has non-zero length
        if (ex_in->extended == 0 && self->near_side[e] != self->far_side[e]) {
            self->last_nodes_edge[edges_child[e]] = e;
            self->last_degree[edges_child[e]] += 1;
            self->last_degree[edges_parent[e]] += 1;
        }
    }
    remove_unextended(&self->edges_in_head, &self->edges_in_tail);

    // done cleanup from last tree transition;
    // now we set the state up for this tree transition
    /* Phase 2a: queue the edges leaving the tree, skipping zero-length ones.
     * The index steps by ``direction`` from out.start to out.stop. */
    for (tj = tree_pos->out.start; tj != tree_pos->out.stop; tj += self->direction) {
        e = tree_pos->out.order[tj];
        if (self->near_side[e] != self->far_side[e]) {
            new_ex = tsk_blkalloc_get(&self->edge_list_heap, sizeof(*new_ex));
            if (new_ex == NULL) {
                ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
                goto out;
            }
            edge_list_append_entry(
                &self->edges_out_head, &self->edges_out_tail, new_ex, e, 0);
        }
    }
    /* Every edge on the out list (new or carried over) is absent from the
     * "next" state */
    for (ex_out = self->edges_out_head; ex_out != NULL; ex_out = ex_out->next) {
        e = ex_out->edge;
        self->parent_out[edges_child[e]] = edges_parent[e];
        self->next_nodes_edge[edges_child[e]] = TSK_NULL;
        self->next_degree[edges_child[e]] -= 1;
        self->next_degree[edges_parent[e]] -= 1;
    }
    /* Phase 2b: queue the edges entering the tree */
    for (tj = tree_pos->in.start; tj != tree_pos->in.stop; tj += self->direction) {
        e = tree_pos->in.order[tj];
        // add edge to pending_in
        new_ex = tsk_blkalloc_get(&self->edge_list_heap, sizeof(*new_ex));
        if (new_ex == NULL) {
            ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
            goto out;
        }
        edge_list_append_entry(&self->edges_in_head, &self->edges_in_tail, new_ex, e, 0);
    }
    /* Every edge on the in list (new or carried over) is present in the
     * "next" state */
    for (ex_in = self->edges_in_head; ex_in != NULL; ex_in = ex_in->next) {
        e = ex_in->edge;
        self->parent_in[edges_child[e]] = edges_parent[e];
        self->next_nodes_edge[edges_child[e]] = e;
        self->next_degree[edges_child[e]] += 1;
        self->next_degree[edges_parent[e]] += 1;
    }

out:
    return ret;
}
/* Decide whether the paths above node ``c`` in the outgoing and incoming
 * trees can be merged, and at what cost.
 *
 * Starting from ``c``, repeatedly step up whichever of the two paths has the
 * strictly younger parent, provided that parent is a non-sample node with
 * zero degree in the other tree (last_degree for the incoming path,
 * next_degree for the outgoing path). Each step whose parent/child link is
 * not already present in the corresponding parent array costs one new edge.
 *
 * Returns the number of new edges needed if at least one step was taken on
 * the outgoing path and both paths stop at the same non-null node; otherwise
 * returns INFINITY. The result is a float only so that it can hold INFINITY. */
static float
haplotype_extender_mergeable(haplotype_extender_t *self, tsk_id_t c)
{
    const double *nodes_time = self->ts->tables->nodes.time;
    tsk_id_t in_parent = self->parent_in[c];
    tsk_id_t out_parent = self->parent_out[c];
    tsk_id_t below = c;
    double in_time = (in_parent == TSK_NULL) ? INFINITY : nodes_time[in_parent];
    double out_time = (out_parent == TSK_NULL) ? INFINITY : nodes_time[out_parent];
    float new_edges = 0;
    int out_steps = 0;

    for (;;) {
        /* The strict time comparisons make the two cases mutually exclusive */
        if (in_parent != TSK_NULL && self->last_degree[in_parent] == 0
            && self->not_sample[in_parent] && in_time < out_time) {
            if (self->parent_in[below] != in_parent) {
                new_edges += 1;
            }
            below = in_parent;
            in_parent = self->parent_in[in_parent];
            in_time = (in_parent == TSK_NULL) ? INFINITY : nodes_time[in_parent];
            continue;
        }
        if (out_parent != TSK_NULL && self->next_degree[out_parent] == 0
            && self->not_sample[out_parent] && out_time < in_time) {
            if (self->parent_out[below] != out_parent) {
                new_edges += 1;
            }
            below = out_parent;
            out_parent = self->parent_out[out_parent];
            out_time = (out_parent == TSK_NULL) ? INFINITY : nodes_time[out_parent];
            out_steps += 1;
            continue;
        }
        break;
    }

    if (out_steps == 0 || in_parent != out_parent || in_parent == TSK_NULL) {
        return INFINITY;
    }
    return new_edges;
}
INFINITY : nodes_time[p_out]; } else { break; } } tsk_bug_assert(p_out == p_in); ret = haplotype_extender_add_or_extend_edge(self, p_out, child, left, right); if (ret != 0) { goto out; } out: return ret; } static int haplotype_extender_extend_paths(haplotype_extender_t *self) { int ret = 0; bool valid; double left, right; float ne, max_new_edges, next_max_new_edges; tsk_tree_position_t tree_pos; edge_list_t *ex_in; tsk_id_t e_in, c, e; tsk_size_t num_edges; tsk_bool_t *keep = NULL; tsk_memset(&tree_pos, 0, sizeof(tree_pos)); ret = tsk_tree_position_init(&tree_pos, self->ts, 0); if (ret != 0) { goto out; } if (self->direction == TSK_DIR_FORWARD) { valid = tsk_tree_position_next(&tree_pos); } else { valid = tsk_tree_position_prev(&tree_pos); } while (valid) { left = tree_pos.interval.left; right = tree_pos.interval.right; ret = haplotype_extender_next_tree(self, &tree_pos); if (ret != 0) { goto out; } max_new_edges = 0; next_max_new_edges = INFINITY; while (max_new_edges < INFINITY) { for (ex_in = self->edges_in_head; ex_in != NULL; ex_in = ex_in->next) { e_in = ex_in->edge; c = self->edges->child[e_in]; if (self->last_degree[c] > 0) { ne = haplotype_extender_mergeable(self, c); if (ne <= max_new_edges) { ret = haplotype_extender_merge_paths(self, c, left, right); if (ret != 0) { goto out; } } else { next_max_new_edges = TSK_MIN(ne, next_max_new_edges); } } } max_new_edges = next_max_new_edges; next_max_new_edges = INFINITY; } if (self->direction == TSK_DIR_FORWARD) { valid = tsk_tree_position_next(&tree_pos); } else { valid = tsk_tree_position_prev(&tree_pos); } } /* Get rid of adjacent, identical edges */ /* note: we need to calloc this here instead of at the start * because we don't know how big it will need to be until now */ num_edges = self->edges->num_rows; keep = tsk_calloc(num_edges, sizeof(*keep)); if (keep == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (e = 0; e < (tsk_id_t) num_edges - 1; e++) { if (self->edges->parent[e] == 
self->edges->parent[e + 1] && self->edges->child[e] == self->edges->child[e + 1] && self->edges->right[e] == self->edges->left[e + 1]) { self->edges->right[e] = self->edges->right[e + 1]; self->edges->left[e + 1] = self->edges->right[e + 1]; } } for (e = 0; e < (tsk_id_t) num_edges; e++) { keep[e] = self->edges->left[e] < self->edges->right[e]; } ret = tsk_edge_table_keep_rows(self->edges, keep, 0, NULL); out: tsk_tree_position_free(&tree_pos); tsk_safe_free(keep); return ret; } static int extend_haplotypes_iter(const tsk_treeseq_t *self, int direction, tsk_edge_table_t *edges, tsk_flags_t options) { int ret = 0; haplotype_extender_t haplotype_extender; tsk_memset(&haplotype_extender, 0, sizeof(haplotype_extender)); ret = haplotype_extender_init(&haplotype_extender, self, direction, edges); if (ret != 0) { goto out; } ret = haplotype_extender_extend_paths(&haplotype_extender); if (ret != 0) { goto out; } if (!!(options & TSK_DEBUG)) { haplotype_extender_print_state(&haplotype_extender, tsk_get_debug_stream()); } out: haplotype_extender_free(&haplotype_extender); return ret; } int TSK_WARN_UNUSED tsk_treeseq_extend_haplotypes( const tsk_treeseq_t *self, int max_iter, tsk_flags_t options, tsk_treeseq_t *output) { int ret = 0; tsk_table_collection_t tables; tsk_treeseq_t ts; int iter, j; tsk_size_t last_num_edges; tsk_bookmark_t sort_start; const int direction[] = { TSK_DIR_FORWARD, TSK_DIR_REVERSE }; tsk_memset(&tables, 0, sizeof(tables)); tsk_memset(&ts, 0, sizeof(ts)); tsk_memset(output, 0, sizeof(*output)); if (max_iter <= 0) { ret = tsk_trace_error(TSK_ERR_EXTEND_EDGES_BAD_MAXITER); goto out; } if (tsk_treeseq_get_num_migrations(self) != 0) { ret = tsk_trace_error(TSK_ERR_MIGRATIONS_NOT_SUPPORTED); goto out; } /* Note: there is a fair bit of copying of table data in this implementation * currently, as we create a new tree sequence for each iteration, which * takes a full copy of the input tables. 
We could streamline this by * adding a flag to treeseq_init which says "steal a reference to these * tables and *don't* free them at the end". Then, we would only need * one copy of the full tables, and could pass in a standalone edge * table to use for in-place updating. */ ret = tsk_table_collection_copy(self->tables, &tables, 0); if (ret != 0) { goto out; } ret = tsk_mutation_table_clear(&tables.mutations); if (ret != 0) { goto out; } ret = tsk_treeseq_init(&ts, &tables, 0); if (ret != 0) { goto out; } last_num_edges = tsk_treeseq_get_num_edges(&ts); for (iter = 0; iter < max_iter; iter++) { for (j = 0; j < 2; j++) { ret = extend_haplotypes_iter(&ts, direction[j], &tables.edges, options); if (ret != 0) { goto out; } /* We're done with the current ts now */ tsk_treeseq_free(&ts); /* no need to sort sites and mutations */ memset(&sort_start, 0, sizeof(sort_start)); sort_start.sites = tables.sites.num_rows; sort_start.mutations = tables.mutations.num_rows; ret = tsk_table_collection_sort(&tables, &sort_start, 0); if (ret != 0) { goto out; } ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); if (ret != 0) { goto out; } } if (last_num_edges == tsk_treeseq_get_num_edges(&ts)) { break; } last_num_edges = tsk_treeseq_get_num_edges(&ts); } /* Remap mutation nodes */ ret = tsk_mutation_table_copy( &self->tables->mutations, &tables.mutations, TSK_NO_INIT); if (ret != 0) { goto out; } /* Note: to allow migrations we'd also have to do this same operation * on the migration nodes; however it's a can of worms because the interval * covering the migration might no longer make sense. 
*/ ret = tsk_treeseq_slide_mutation_nodes_up(&ts, &tables.mutations); if (ret != 0) { goto out; } tsk_treeseq_free(&ts); ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); if (ret != 0) { goto out; } /* Hand ownership of the tree sequence to the calling code */ tsk_memcpy(output, &ts, sizeof(ts)); tsk_memset(&ts, 0, sizeof(*output)); out: tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); return ret; } /* ======================================================== * * Pair coalescence * ======================================================== */ static int check_node_bin_map( const tsk_size_t num_nodes, const tsk_size_t num_bins, const tsk_id_t *node_bin_map) { int ret = 0; tsk_id_t max_index, index; tsk_size_t i; max_index = TSK_NULL; for (i = 0; i < num_nodes; i++) { index = node_bin_map[i]; if (index < TSK_NULL) { ret = tsk_trace_error(TSK_ERR_BAD_NODE_BIN_MAP); goto out; } if (index > max_index) { max_index = index; } } if (num_bins < 1 || (tsk_id_t) num_bins < max_index + 1) { ret = tsk_trace_error(TSK_ERR_BAD_NODE_BIN_MAP_DIM); goto out; } out: return ret; } static inline void TRANSPOSE_2D(tsk_size_t rows, tsk_size_t cols, const double *source, double *dest) { tsk_size_t i, j; for (i = 0; i < rows; ++i) { for (j = 0; j < cols; ++j) { dest[j * rows + i] = source[i * cols + j]; } } } static inline void pair_coalescence_count(tsk_size_t num_set_indexes, const tsk_id_t *set_indexes, tsk_size_t num_sample_sets, const double *parent_count, const double *child_count, const double *parent_state, const double *inside, double *outside, double *result) { tsk_size_t i; tsk_id_t j, k; for (i = 0; i < num_sample_sets; i++) { outside[i] = parent_count[i] - child_count[i] - parent_state[i]; } for (i = 0; i < num_set_indexes; i++) { j = set_indexes[2 * i]; k = set_indexes[2 * i + 1]; result[i] = outside[j] * inside[k]; if (j != k) { result[i] += outside[k] * inside[j]; } } } int tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t 
num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes, tsk_size_t num_windows, const double *windows, tsk_size_t num_bins, const tsk_id_t *node_bin_map, pair_coalescence_stat_func_t *summary_func, tsk_size_t summary_func_dim, void *summary_func_args, tsk_flags_t options, double *result) { int ret = 0; double left, right, remaining_span, missing_span, window_span, denominator, x, t; tsk_id_t e, p, c, u, v, w, i, j; tsk_size_t num_samples, num_edges; tsk_tree_position_t tree_pos; const tsk_table_collection_t *tables = self->tables; const tsk_size_t num_nodes = tables->nodes.num_rows; const double *restrict nodes_time = self->tables->nodes.time; const double sequence_length = tables->sequence_length; const tsk_size_t num_outputs = summary_func_dim; /* buffers */ bool *visited = NULL; tsk_id_t *nodes_sample_set = NULL; tsk_id_t *nodes_parent = NULL; double *coalescing_pairs = NULL; double *coalescence_time = NULL; double *nodes_sample = NULL; double *sample_count = NULL; double *bin_weight = NULL; double *bin_values = NULL; double *pair_count = NULL; double *total_pair = NULL; double *outside = NULL; /* row pointers */ double *inside = NULL; double *weight = NULL; double *values = NULL; double *output = NULL; double *above = NULL; double *below = NULL; double *state = NULL; double *pairs = NULL; double *times = NULL; tsk_memset(&tree_pos, 0, sizeof(tree_pos)); /* check inputs */ ret = tsk_treeseq_check_windows(self, num_windows, windows, TSK_REQUIRE_FULL_SPAN); if (ret != 0) { goto out; } ret = check_set_indexes(num_sample_sets, 2 * num_set_indexes, set_indexes); if (ret != 0) { goto out; } ret = tsk_treeseq_check_sample_sets( self, num_sample_sets, sample_set_sizes, sample_sets); if (ret != 0) { goto out; } ret = check_node_bin_map(num_nodes, num_bins, node_bin_map); if (ret != 0) { goto out; } /* map nodes to sample sets */ nodes_sample_set = tsk_malloc(num_nodes * 
sizeof(*nodes_sample_set)); if (nodes_sample_set == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } ret = get_sample_set_index_map(self, num_sample_sets, sample_set_sizes, sample_sets, &num_samples, nodes_sample_set); if (ret != 0) { goto out; } visited = tsk_malloc(num_nodes * sizeof(*visited)); outside = tsk_malloc(num_sample_sets * sizeof(*outside)); nodes_parent = tsk_malloc(num_nodes * sizeof(*nodes_parent)); nodes_sample = tsk_calloc(num_nodes * num_sample_sets, sizeof(*nodes_sample)); sample_count = tsk_malloc(num_nodes * num_sample_sets * sizeof(*sample_count)); coalescing_pairs = tsk_calloc(num_bins * num_set_indexes, sizeof(*coalescing_pairs)); coalescence_time = tsk_calloc(num_bins * num_set_indexes, sizeof(*coalescence_time)); bin_weight = tsk_malloc(num_bins * num_set_indexes * sizeof(*bin_weight)); bin_values = tsk_malloc(num_bins * num_set_indexes * sizeof(*bin_values)); pair_count = tsk_malloc(num_set_indexes * sizeof(*pair_count)); total_pair = tsk_malloc(num_set_indexes * sizeof(*total_pair)); if (nodes_parent == NULL || nodes_sample == NULL || sample_count == NULL || coalescing_pairs == NULL || bin_weight == NULL || bin_values == NULL || outside == NULL || pair_count == NULL || visited == NULL || total_pair == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { u = set_indexes[2 * i]; v = set_indexes[2 * i + 1]; total_pair[i] = (double) sample_set_sizes[u] * (double) sample_set_sizes[v]; if (u == v) { total_pair[i] -= (double) sample_set_sizes[v]; total_pair[i] /= 2; } } /* initialize internal state */ for (c = 0; c < (tsk_id_t) num_nodes; c++) { i = nodes_sample_set[c]; if (i != TSK_NULL) { state = GET_2D_ROW(nodes_sample, num_sample_sets, c); state[i] = 1.0; } nodes_parent[c] = TSK_NULL; visited[c] = false; } tsk_memcpy( sample_count, nodes_sample, num_nodes * num_sample_sets * sizeof(*sample_count)); ret = tsk_tree_position_init(&tree_pos, self, 0); if (ret != 0) { goto 
out; } num_edges = 0; missing_span = 0.0; w = 0; while (true) { tsk_tree_position_next(&tree_pos); if (tree_pos.index == TSK_NULL) { break; } left = tree_pos.interval.left; right = tree_pos.interval.right; remaining_span = sequence_length - left; for (u = tree_pos.out.start; u != tree_pos.out.stop; u++) { e = tree_pos.out.order[u]; p = tables->edges.parent[e]; c = tables->edges.child[e]; nodes_parent[c] = TSK_NULL; inside = GET_2D_ROW(sample_count, num_sample_sets, c); while (p != TSK_NULL) { /* downdate statistic */ v = node_bin_map[p]; t = nodes_time[p]; if (v != TSK_NULL) { above = GET_2D_ROW(sample_count, num_sample_sets, p); below = GET_2D_ROW(sample_count, num_sample_sets, c); state = GET_2D_ROW(nodes_sample, num_sample_sets, p); pairs = GET_2D_ROW(coalescing_pairs, num_set_indexes, v); times = GET_2D_ROW(coalescence_time, num_set_indexes, v); pair_coalescence_count(num_set_indexes, set_indexes, num_sample_sets, above, below, state, inside, outside, pair_count); for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { x = pair_count[i] * remaining_span; pairs[i] -= x; times[i] -= t * x; } } c = p; p = nodes_parent[c]; } p = tables->edges.parent[e]; while (p != TSK_NULL) { /* downdate state */ above = GET_2D_ROW(sample_count, num_sample_sets, p); for (i = 0; i < (tsk_id_t) num_sample_sets; i++) { above[i] -= inside[i]; } p = nodes_parent[p]; } num_edges -= 1; } for (u = tree_pos.in.start; u != tree_pos.in.stop; u++) { e = tree_pos.in.order[u]; p = tables->edges.parent[e]; c = tables->edges.child[e]; nodes_parent[c] = p; inside = GET_2D_ROW(sample_count, num_sample_sets, c); while (p != TSK_NULL) { /* update state */ above = GET_2D_ROW(sample_count, num_sample_sets, p); for (i = 0; i < (tsk_id_t) num_sample_sets; i++) { above[i] += inside[i]; } p = nodes_parent[p]; } p = tables->edges.parent[e]; while (p != TSK_NULL) { /* update statistic */ v = node_bin_map[p]; t = nodes_time[p]; if (v != TSK_NULL) { above = GET_2D_ROW(sample_count, num_sample_sets, p); below = 
GET_2D_ROW(sample_count, num_sample_sets, c); state = GET_2D_ROW(nodes_sample, num_sample_sets, p); pairs = GET_2D_ROW(coalescing_pairs, num_set_indexes, v); times = GET_2D_ROW(coalescence_time, num_set_indexes, v); pair_coalescence_count(num_set_indexes, set_indexes, num_sample_sets, above, below, state, inside, outside, pair_count); for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { x = pair_count[i] * remaining_span; pairs[i] += x; times[i] += t * x; } } c = p; p = nodes_parent[c]; } num_edges += 1; } if (num_edges == 0) { missing_span += right - left; } /* flush windows */ while (w < (tsk_id_t) num_windows && windows[w + 1] <= right) { TRANSPOSE_2D(num_bins, num_set_indexes, coalescing_pairs, bin_weight); TRANSPOSE_2D(num_bins, num_set_indexes, coalescence_time, bin_values); tsk_memset(coalescing_pairs, 0, num_bins * num_set_indexes * sizeof(*coalescing_pairs)); tsk_memset(coalescence_time, 0, num_bins * num_set_indexes * sizeof(*coalescence_time)); remaining_span = sequence_length - windows[w + 1]; for (j = 0; j < (tsk_id_t) num_samples; j++) { /* truncate at tree */ c = sample_sets[j]; p = nodes_parent[c]; while (!visited[c] && p != TSK_NULL) { v = node_bin_map[p]; t = nodes_time[p]; if (v != TSK_NULL) { above = GET_2D_ROW(sample_count, num_sample_sets, p); below = GET_2D_ROW(sample_count, num_sample_sets, c); state = GET_2D_ROW(nodes_sample, num_sample_sets, p); pairs = GET_2D_ROW(coalescing_pairs, num_set_indexes, v); times = GET_2D_ROW(coalescence_time, num_set_indexes, v); pair_coalescence_count(num_set_indexes, set_indexes, num_sample_sets, above, below, state, below, outside, pair_count); for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { weight = GET_2D_ROW(bin_weight, num_bins, i); values = GET_2D_ROW(bin_values, num_bins, i); x = pair_count[i] * remaining_span / 2; pairs[i] += x; times[i] += t * x; weight[v] -= x; values[v] -= t * x; } } visited[c] = true; c = p; p = nodes_parent[c]; } } for (j = 0; j < (tsk_id_t) num_samples; j++) { /* reset tree */ 
c = sample_sets[j]; p = nodes_parent[c]; while (visited[c] && p != TSK_NULL) { visited[c] = false; c = p; p = nodes_parent[c]; } } for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { /* normalise values */ weight = GET_2D_ROW(bin_weight, num_bins, i); values = GET_2D_ROW(bin_values, num_bins, i); for (v = 0; v < (tsk_id_t) num_bins; v++) { values[v] /= weight[v]; } } /* normalise weights */ if (options & (TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE)) { window_span = windows[w + 1] - windows[w] - missing_span; missing_span = 0.0; if (num_edges == 0) { /* missing interval, so remove overcounted missing span */ remaining_span = right - windows[w + 1]; window_span += remaining_span; missing_span += remaining_span; } for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { denominator = 1.0; if (options & TSK_STAT_SPAN_NORMALISE) { denominator *= window_span; } if (options & TSK_STAT_PAIR_NORMALISE) { denominator *= total_pair[i]; } weight = GET_2D_ROW(bin_weight, num_bins, i); for (v = 0; v < (tsk_id_t) num_bins; v++) { weight[v] *= denominator == 0.0 ? 
0.0 : 1 / denominator; } } } for (i = 0; i < (tsk_id_t) num_set_indexes; i++) { /* summarise bins */ weight = GET_2D_ROW(bin_weight, num_bins, i); values = GET_2D_ROW(bin_values, num_bins, i); output = GET_3D_ROW( result, num_set_indexes, num_outputs, (tsk_size_t) w, i); ret = summary_func( num_bins, weight, values, num_outputs, output, summary_func_args); if (ret != 0) { goto out; } }; w += 1; } } out: tsk_tree_position_free(&tree_pos); tsk_safe_free(nodes_sample_set); tsk_safe_free(coalescing_pairs); tsk_safe_free(coalescence_time); tsk_safe_free(nodes_parent); tsk_safe_free(nodes_sample); tsk_safe_free(sample_count); tsk_safe_free(bin_weight); tsk_safe_free(bin_values); tsk_safe_free(pair_count); tsk_safe_free(total_pair); tsk_safe_free(visited); tsk_safe_free(outside); return ret; } static int pair_coalescence_weights(tsk_size_t TSK_UNUSED(input_dim), const double *weight, const double *TSK_UNUSED(values), tsk_size_t output_dim, double *output, void *TSK_UNUSED(params)) { int ret = 0; tsk_memcpy(output, weight, output_dim * sizeof(*output)); return ret; } int tsk_treeseq_pair_coalescence_counts(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes, tsk_size_t num_windows, const double *windows, tsk_size_t num_bins, const tsk_id_t *node_bin_map, tsk_flags_t options, double *result) { return tsk_treeseq_pair_coalescence_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_set_indexes, set_indexes, num_windows, windows, num_bins, node_bin_map, pair_coalescence_weights, num_bins, NULL, options, result); } static int pair_coalescence_quantiles(tsk_size_t input_dim, const double *weight, const double *values, tsk_size_t output_dim, double *output, void *params) { int ret = 0; double coalesced, timepoint; double *quantiles = (double *) params; tsk_size_t i, j; j = 0; coalesced = 0.0; timepoint = TSK_UNKNOWN_TIME; for (i = 0; i < 
output_dim; i++) { output[i] = NAN; } for (i = 0; i < input_dim; i++) { if (weight[i] > 0) { coalesced += weight[i]; timepoint = values[i]; while (j < output_dim && quantiles[j] <= coalesced) { output[j] = timepoint; j += 1; } } } if (quantiles[output_dim - 1] == 1.0) { output[output_dim - 1] = timepoint; } return ret; } static int check_quantiles(const tsk_size_t num_quantiles, const double *quantiles) { int ret = 0; tsk_size_t i; double last = -INFINITY; for (i = 0; i < num_quantiles; i++) { if (quantiles[i] <= last || quantiles[i] < 0.0 || quantiles[i] > 1.0) { ret = tsk_trace_error(TSK_ERR_BAD_QUANTILES); goto out; } last = quantiles[i]; } out: return ret; } static int check_sorted_node_bin_map( const tsk_treeseq_t *self, tsk_size_t num_bins, const tsk_id_t *node_bin_map) { int ret = 0; tsk_size_t num_nodes = self->tables->nodes.num_rows; const double *nodes_time = self->tables->nodes.time; double last; tsk_id_t i, j; double *min_time = tsk_malloc(num_bins * sizeof(*min_time)); double *max_time = tsk_malloc(num_bins * sizeof(*max_time)); if (min_time == NULL || max_time == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < (tsk_id_t) num_bins; j++) { min_time[j] = TSK_UNKNOWN_TIME; max_time[j] = TSK_UNKNOWN_TIME; } for (i = 0; i < (tsk_id_t) num_nodes; i++) { j = node_bin_map[i]; if (j < 0 || j >= (tsk_id_t) num_bins) { continue; } if (tsk_is_unknown_time(max_time[j]) || nodes_time[i] > max_time[j]) { max_time[j] = nodes_time[i]; } if (tsk_is_unknown_time(min_time[j]) || nodes_time[i] < min_time[j]) { min_time[j] = nodes_time[i]; } } last = -INFINITY; for (j = 0; j < (tsk_id_t) num_bins; j++) { if (tsk_is_unknown_time(min_time[j])) { continue; } if (min_time[j] < last) { ret = tsk_trace_error(TSK_ERR_UNSORTED_TIMES); goto out; } else { last = max_time[j]; } } out: tsk_safe_free(min_time); tsk_safe_free(max_time); return ret; } int tsk_treeseq_pair_coalescence_quantiles(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const 
/*
 * Summary function converting per-time-window coalescence weights into rates.
 *
 * `params` is read as an array of time window breakpoints; entries i and
 * i + 1 are used for window i. Trailing windows whose weight is exactly zero
 * are filled with NAN. For the last window with non-zero weight the rate is
 * 1 / (values[i] - start of window), with the waiting time floored at zero.
 * For every earlier window the rate is
 *     log(1 - weight[i] / (1 - coalesced)) / (a - b)
 * where `coalesced` is the sum of the weights of the preceding windows and
 * [a, b) is the window. Rates that are not strictly positive are written as 0.
 *
 * Requires input_dim == output_dim (checked with tsk_bug_assert). Always
 * returns 0.
 */
static int
pair_coalescence_rates(tsk_size_t input_dim, const double *weight, const double *values,
    tsk_size_t output_dim, double *output, void *params)
{
    int ret = 0;
    const double *breaks = (double *) params;
    double cumulative = 0.0;
    double window_rate, wait, start, end;
    tsk_id_t k;
    tsk_id_t num_active = (tsk_id_t) output_dim;

    tsk_bug_assert(input_dim == output_dim);

    /* find last window with data */
    while (num_active > 0 && weight[num_active - 1] == 0) {
        /* TODO: should fill value be zero instead? */
        output[num_active - 1] = NAN;
        num_active--;
    }

    for (k = 0; k < num_active; k++) {
        start = breaks[k];
        end = breaks[k + 1];
        if (k + 1 != num_active) {
            window_rate = log(1 - weight[k] / (1 - cumulative)) / (start - end);
        } else {
            if (values[k] < start) {
                wait = 0.0;
            } else {
                wait = values[k] - start;
            }
            window_rate = 1 / wait;
        }
        /* avoid tiny negative values from fp error */
        if (window_rate > 0) {
            output[k] = window_rate;
        } else {
            output[k] = 0;
        }
        cumulative += weight[k];
    }
    return ret;
}
options, double *result) { int ret = 0; void *params = (void *) time_windows; ret = check_coalescence_rate_time_windows(self, num_sample_sets, sample_set_sizes, sample_sets, num_time_windows, node_time_window, time_windows); if (ret != 0) { goto out; } options |= TSK_STAT_SPAN_NORMALISE | TSK_STAT_PAIR_NORMALISE; ret = tsk_treeseq_pair_coalescence_stat(self, num_sample_sets, sample_set_sizes, sample_sets, num_set_indexes, set_indexes, num_windows, windows, num_time_windows, node_time_window, pair_coalescence_rates, num_time_windows, params, options, result); if (ret != 0) { goto out; } out: return ret; } /* ======================================================== * * Relatedness matrix-vector product * ======================================================== */ typedef struct { const tsk_treeseq_t *ts; tsk_size_t num_weights; const double *weights; tsk_size_t num_windows; const double *windows; tsk_size_t num_focal_nodes; const tsk_id_t *focal_nodes; tsk_flags_t options; double *result; tsk_tree_position_t tree_pos; double position; tsk_size_t num_nodes; tsk_id_t *parent; double *x; double *w; double *v; } tsk_matvec_calculator_t; static void tsk_matvec_calculator_print_state(const tsk_matvec_calculator_t *self, FILE *out) { tsk_id_t j; tsk_size_t num_samples = tsk_treeseq_get_num_samples(self->ts); fprintf(out, "Matvec state:\n"); fprintf(out, "options = %d\n", self->options); fprintf(out, "position = %f\n", self->position); fprintf(out, "focal nodes = %lld: [", (long long) self->num_focal_nodes); fprintf(out, "tree_pos:\n"); tsk_tree_position_print_state(&self->tree_pos, out); fprintf(out, "samples = %lld: [", (long long) num_samples); fprintf(out, "]\n"); fprintf(out, "node\tparent\tx\tv\tw"); fprintf(out, "\n"); for (j = 0; j < (tsk_id_t) self->num_nodes; j++) { fprintf(out, "%lld\t", (long long) j); fprintf(out, "%lld\t%g\t%g\t%g\n", (long long) self->parent[j], self->x[j], self->v[j], self->w[j]); } } static int 
tsk_matvec_calculator_init(tsk_matvec_calculator_t *self, const tsk_treeseq_t *ts, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_size_t num_focal_nodes, const tsk_id_t *focal_nodes, tsk_flags_t options, double *result) { int ret = 0; tsk_size_t num_samples = tsk_treeseq_get_num_samples(ts); const tsk_size_t num_nodes = ts->tables->nodes.num_rows; const double *row; double *new_row; tsk_size_t k; tsk_id_t index, u, j; double *weight_means = tsk_malloc(num_weights * sizeof(*weight_means)); const tsk_size_t num_trees = ts->num_trees; const double *restrict breakpoints = ts->breakpoints; self->ts = ts; self->num_weights = num_weights; self->weights = weights; self->num_windows = num_windows; self->windows = windows; self->num_focal_nodes = num_focal_nodes; self->focal_nodes = focal_nodes; self->options = options; self->result = result; self->num_nodes = num_nodes; self->position = windows[0]; self->parent = tsk_malloc(num_nodes * sizeof(*self->parent)); self->x = tsk_calloc(num_nodes, sizeof(*self->x)); self->v = tsk_calloc(num_nodes * num_weights, sizeof(*self->v)); self->w = tsk_calloc(num_nodes * num_weights, sizeof(*self->w)); if (self->parent == NULL || self->x == NULL || self->w == NULL || self->v == NULL || weight_means == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } tsk_memset(result, 0, num_windows * num_focal_nodes * num_weights * sizeof(*result)); tsk_memset(self->parent, TSK_NULL, num_nodes * sizeof(*self->parent)); for (j = 0; j < (tsk_id_t) num_focal_nodes; j++) { if (focal_nodes[j] < 0 || (tsk_size_t) focal_nodes[j] >= num_nodes) { ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS); goto out; } } ret = tsk_tree_position_init(&self->tree_pos, ts, 0); if (ret != 0) { goto out; } /* seek to the first window */ index = (tsk_id_t) tsk_search_sorted(breakpoints, num_trees + 1, windows[0]); if (breakpoints[index] > windows[0]) { index--; } ret = tsk_tree_position_seek_forward(&self->tree_pos, 
index); if (ret != 0) { goto out; } for (k = 0; k < num_weights; k++) { weight_means[k] = 0.0; } /* centre the input */ if (!(options & TSK_STAT_NONCENTRED)) { for (j = 0; j < (tsk_id_t) num_samples; j++) { row = GET_2D_ROW(weights, num_weights, j); for (k = 0; k < num_weights; k++) { weight_means[k] += row[k]; } } for (k = 0; k < num_weights; k++) { weight_means[k] /= (double) num_samples; } } /* set the initial state */ for (j = 0; j < (tsk_id_t) num_samples; j++) { u = ts->samples[j]; row = GET_2D_ROW(weights, num_weights, j); new_row = GET_2D_ROW(self->w, num_weights, u); for (k = 0; k < num_weights; k++) { new_row[k] = row[k] - weight_means[k]; } } out: tsk_safe_free(weight_means); return ret; } static int tsk_matvec_calculator_free(tsk_matvec_calculator_t *self) { tsk_safe_free(self->parent); tsk_safe_free(self->x); tsk_safe_free(self->w); tsk_safe_free(self->v); tsk_tree_position_free(&self->tree_pos); /* Make this safe for multiple free calls */ memset(self, 0, sizeof(*self)); return 0; } static inline void tsk_matvec_calculator_add_z(tsk_id_t u, tsk_id_t p, const double position, double *restrict x, const tsk_size_t num_weights, double *restrict w, double *restrict v, const double *restrict nodes_time) { double t, span; tsk_size_t j; double *restrict v_row, *restrict w_row; if (p != TSK_NULL) { t = nodes_time[p] - nodes_time[u]; span = position - x[u]; // do this: self->v[u] += t * span * self->w[u]; w_row = GET_2D_ROW(w, num_weights, u); v_row = GET_2D_ROW(v, num_weights, u); for (j = 0; j < num_weights; j++) { v_row[j] += t * span * w_row[j]; } } x[u] = position; } static void tsk_matvec_calculator_adjust_path_up( tsk_matvec_calculator_t *self, tsk_id_t p, tsk_id_t c, double sign) { tsk_size_t j; double *p_row, *c_row; const tsk_id_t *restrict parent = self->parent; const double position = self->position; double *restrict x = self->x; const tsk_size_t num_weights = self->num_weights; double *restrict w = self->w; double *restrict v = self->v; const 
double *restrict nodes_time = self->ts->tables->nodes.time; // sign = -1 for removing edges, +1 for adding while (p != TSK_NULL) { tsk_matvec_calculator_add_z( p, parent[p], position, x, num_weights, w, v, nodes_time); // do this: self->v[c] -= sign * self->v[p]; p_row = GET_2D_ROW(v, num_weights, p); c_row = GET_2D_ROW(v, num_weights, c); for (j = 0; j < num_weights; j++) { c_row[j] -= sign * p_row[j]; } // do this: self->w[p] += sign * self->w[c]; p_row = GET_2D_ROW(w, num_weights, p); c_row = GET_2D_ROW(w, num_weights, c); for (j = 0; j < num_weights; j++) { p_row[j] += sign * c_row[j]; } p = parent[p]; } } static void tsk_matvec_calculator_remove_edge(tsk_matvec_calculator_t *self, tsk_id_t p, tsk_id_t c) { tsk_id_t *parent = self->parent; const double position = self->position; double *restrict x = self->x; const tsk_size_t num_weights = self->num_weights; double *restrict w = self->w; double *restrict v = self->v; const double *restrict nodes_time = self->ts->tables->nodes.time; tsk_matvec_calculator_add_z( c, parent[c], position, x, num_weights, w, v, nodes_time); parent[c] = TSK_NULL; tsk_matvec_calculator_adjust_path_up(self, p, c, -1); } static void tsk_matvec_calculator_insert_edge(tsk_matvec_calculator_t *self, tsk_id_t p, tsk_id_t c) { tsk_id_t *parent = self->parent; tsk_matvec_calculator_adjust_path_up(self, p, c, +1); self->x[c] = self->position; parent[c] = p; } static int tsk_matvec_calculator_write_output(tsk_matvec_calculator_t *self, double *restrict y) { int ret = 0; tsk_id_t u; tsk_size_t j, k; const tsk_size_t n = self->num_focal_nodes; const tsk_size_t num_weights = self->num_weights; const double position = self->position; double *u_row, *out_row; double *out_means = tsk_malloc(num_weights * sizeof(*out_means)); const tsk_id_t *restrict parent = self->parent; const double *restrict nodes_time = self->ts->tables->nodes.time; double *restrict x = self->x; double *restrict w = self->w; double *restrict v = self->v; const tsk_id_t *restrict 
focal_nodes = self->focal_nodes; if (out_means == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } for (j = 0; j < n; j++) { out_row = GET_2D_ROW(y, num_weights, j); u = focal_nodes[j]; while (u != TSK_NULL) { if (x[u] != position) { tsk_matvec_calculator_add_z( u, parent[u], position, x, num_weights, w, v, nodes_time); } u_row = GET_2D_ROW(v, num_weights, u); for (k = 0; k < num_weights; k++) { out_row[k] += u_row[k]; } u = parent[u]; } } if (!(self->options & TSK_STAT_NONCENTRED)) { for (k = 0; k < num_weights; k++) { out_means[k] = 0.0; } for (j = 0; j < n; j++) { out_row = GET_2D_ROW(y, num_weights, j); for (k = 0; k < num_weights; k++) { out_means[k] += out_row[k]; } } for (k = 0; k < num_weights; k++) { out_means[k] /= (double) n; } for (j = 0; j < n; j++) { out_row = GET_2D_ROW(y, num_weights, j); for (k = 0; k < num_weights; k++) { out_row[k] -= out_means[k]; } } } /* zero out v */ tsk_memset(self->v, 0, self->num_nodes * num_weights * sizeof(*self->v)); out: tsk_safe_free(out_means); return ret; } static int tsk_matvec_calculator_run(tsk_matvec_calculator_t *self) { int ret = 0; tsk_size_t j, k, m; tsk_id_t e, p, c; const tsk_size_t out_size = self->num_weights * self->num_focal_nodes; const tsk_size_t num_edges = self->ts->tables->edges.num_rows; const double *restrict edge_right = self->ts->tables->edges.right; const double *restrict edge_left = self->ts->tables->edges.left; const tsk_id_t *restrict edge_child = self->ts->tables->edges.child; const tsk_id_t *restrict edge_parent = self->ts->tables->edges.parent; const double *restrict windows = self->windows; double *restrict out; tsk_tree_position_t tree_pos = self->tree_pos; const tsk_id_t *restrict in_order = tree_pos.in.order; const tsk_id_t *restrict out_order = tree_pos.out.order; bool valid; double next_position; m = 0; self->position = windows[0]; for (j = (tsk_size_t) tree_pos.in.start; j != (tsk_size_t) tree_pos.in.stop; j++) { e = in_order[j]; tsk_bug_assert(edge_left[e] <= 
self->position); if (self->position < edge_right[e]) { p = edge_parent[e]; c = edge_child[e]; tsk_matvec_calculator_insert_edge(self, p, c); } } valid = tsk_tree_position_next(&tree_pos); j = (tsk_size_t) tree_pos.in.start; k = (tsk_size_t) tree_pos.out.start; while (m < self->num_windows) { if (valid && self->position == tree_pos.interval.left) { for (k = (tsk_size_t) tree_pos.out.start; k != (tsk_size_t) tree_pos.out.stop; k++) { e = out_order[k]; p = edge_parent[e]; c = edge_child[e]; tsk_matvec_calculator_remove_edge(self, p, c); } for (j = (tsk_size_t) tree_pos.in.start; j != (tsk_size_t) tree_pos.in.stop; j++) { e = in_order[j]; p = edge_parent[e]; c = edge_child[e]; tsk_matvec_calculator_insert_edge(self, p, c); } valid = tsk_tree_position_next(&tree_pos); } next_position = windows[m + 1]; if (j < num_edges) { next_position = TSK_MIN(next_position, edge_left[in_order[j]]); } if (k < num_edges) { next_position = TSK_MIN(next_position, edge_right[out_order[k]]); } tsk_bug_assert(self->position < next_position); self->position = next_position; if (self->position == windows[m + 1]) { out = GET_2D_ROW(self->result, out_size, m); tsk_matvec_calculator_write_output(self, out); m += 1; } if (self->options & TSK_DEBUG) { tsk_matvec_calculator_print_state(self, tsk_get_debug_stream()); } } if (!!(self->options & TSK_STAT_SPAN_NORMALISE)) { span_normalise(self->num_windows, windows, out_size, self->result); } /* out: */ return ret; } int tsk_treeseq_genetic_relatedness_vector(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_size_t num_focal_nodes, const tsk_id_t *focal_nodes, double *result, tsk_flags_t options) { int ret = 0; bool stat_site = !!(options & TSK_STAT_SITE); bool stat_node = !!(options & TSK_STAT_NODE); tsk_matvec_calculator_t calc; memset(&calc, 0, sizeof(calc)); if (stat_node || stat_site) { ret = tsk_trace_error(TSK_ERR_UNSUPPORTED_STAT_MODE); goto out; } ret = 
tsk_treeseq_check_windows(self, num_windows, windows, 0); if (ret != 0) { goto out; } ret = tsk_matvec_calculator_init(&calc, self, num_weights, weights, num_windows, windows, num_focal_nodes, focal_nodes, options, result); if (ret != 0) { goto out; } if (options & TSK_DEBUG) { tsk_matvec_calculator_print_state(&calc, tsk_get_debug_stream()); } ret = tsk_matvec_calculator_run(&calc); out: tsk_matvec_calculator_free(&calc); return ret; } ================================================ FILE: c/tskit/trees.h ================================================ /* * MIT License * * Copyright (c) 2019-2024 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /** * @file trees.h * @brief Tskit core tree sequence operations. 
/**
@brief The tree sequence object.
*/
typedef struct {
    /* Number of trees; the breakpoints array is searched with
     * num_trees + 1 entries */
    tsk_size_t num_trees;
    /* Length of the samples array */
    tsk_size_t num_samples;
    /* Node IDs of the samples */
    tsk_id_t *samples;
    /* Does this tree sequence have time_units == "uncalibrated" */
    bool time_uncalibrated;
    /* Are all genome coordinates discrete? */
    bool discrete_genome;
    /* Are all time values discrete? */
    bool discrete_time;
    /* Min and max time in node table and mutation table */
    double min_time;
    double max_time;
    /* Breakpoints along the sequence, including 0 and L. */
    double *breakpoints;
    /* If a node is a sample, map to its index in the samples list */
    tsk_id_t *sample_index_map;
    /* Map individuals to the list of nodes that reference them */
    tsk_id_t *individual_nodes_mem;
    tsk_id_t **individual_nodes;
    tsk_size_t *individual_nodes_length;
    /* For each tree, a list of sites on that tree */
    tsk_site_t *tree_sites_mem;
    tsk_site_t **tree_sites;
    tsk_size_t *tree_sites_length;
    /* For each site, a list of mutations at that site */
    tsk_mutation_t *site_mutations_mem;
    tsk_mutation_t **site_mutations;
    tsk_size_t *site_mutations_length;
    /** @brief The table collection underlying this tree sequence. This table
     * collection must be treated as read-only, and any changes to it will
     * lead to undefined behaviour. */
    tsk_table_collection_t *tables;
} tsk_treeseq_t;
Please see the :ref:`sec_c_api_examples_tree_iteration` examples for recommended usage. @endrst */ typedef struct { /** * @brief The parent tree sequence. */ const tsk_treeseq_t *tree_sequence; /** @brief The ID of the "virtual root" whose children are the roots of the tree. */ tsk_id_t virtual_root; /** @brief The parent of node u is parent[u]. Equal to ``TSK_NULL`` if node u is a root or is not a node in the current tree. */ tsk_id_t *parent; /** @brief The leftmost child of node u is left_child[u]. Equal to ``TSK_NULL`` if node u is a leaf or is not a node in the current tree. */ tsk_id_t *left_child; /** @brief The rightmost child of node u is right_child[u]. Equal to ``TSK_NULL`` if node u is a leaf or is not a node in the current tree. */ tsk_id_t *right_child; /** @brief The sibling to the left of node u is left_sib[u]. Equal to ``TSK_NULL`` if node u has no siblings to its left. */ tsk_id_t *left_sib; /** @brief The sibling to the right of node u is right_sib[u]. Equal to ``TSK_NULL`` if node u has no siblings to its right. */ tsk_id_t *right_sib; /** @brief The number of children of node u is num_children[u]. */ tsk_id_t *num_children; /** @brief Array of edge ids where ``edge[u]`` is the edge that encodes the relationship between the child node ``u`` and its parent. Equal to ``TSK_NULL`` if node ``u`` is a root, virtual root or is not a node in the current tree. */ tsk_id_t *edge; /** @brief The total number of edges defining the topology of this tree. This is equal to the number of tree sequence edges that intersect with the tree's genomic interval. */ tsk_size_t num_edges; /** @brief Left and right coordinates of the genomic interval that this tree covers. The left coordinate is inclusive and the right coordinate exclusive. @rst Example: .. 
code-block:: c tsk_tree_t tree; int ret; // initialise etc ret = tsk_tree_first(&tree); // Check for error assert(ret == TSK_TREE_OK); printf("Coordinates covered by first tree are left=%f, right=%f\n", tree.interval.left, tree.interval.right); @endrst */ struct { double left; double right; } interval; /** @brief The index of this tree in the tree sequence. @rst This attribute provides the zero-based index of the tree represented by the current state of the struct within the parent tree sequence. For example, immediately after we call ``tsk_tree_first(&tree)``, ``tree.index`` will be zero, and after we call ``tsk_tree_last(&tree)``, ``tree.index`` will be the number of trees - 1 (see :c:func:`tsk_treeseq_get_num_trees`). When the tree is in the null state (immediately after initialisation, or after, e.g., calling :c:func:`tsk_tree_prev` on the first tree) the value of the ``index`` is -1. @endrst */ tsk_id_t index; /* Attributes below are private and should not be used in client code. */ tsk_size_t num_nodes; tsk_flags_t options; tsk_size_t root_threshold; const tsk_id_t *samples; /* These are involved in the optional sample tracking; num_samples counts all samples below a given node, and num_tracked_samples counts those from a specific subset. By default sample counts are tracked and roots maintained. If ``TSK_NO_SAMPLE_COUNTS`` is specified, then neither sample counts nor roots are available. */ tsk_size_t *num_samples; tsk_size_t *num_tracked_samples; /* These are for the optional sample list tracking. */ tsk_id_t *left_sample; tsk_id_t *right_sample; tsk_id_t *next_sample; /* The sites on this tree */ const tsk_site_t *sites; tsk_size_t sites_length; /* Counters needed for next() and prev() transformations. 
*/ int direction; tsk_id_t left_index; tsk_id_t right_index; tsk_tree_position_t tree_pos; } tsk_tree_t; /****************************************************************************/ /* Tree sequence.*/ /****************************************************************************/ /** @defgroup TREESEQ_API_GROUP Tree sequence API @{ */ /** @brief Initialises the tree sequence based on the specified table collection. @rst This method will copy the supplied table collection unless :c:macro:`TSK_TAKE_OWNERSHIP` is specified. The table collection will be checked for integrity and index maps built. This must be called before any operations are performed on the tree sequence. See the :ref:`sec_c_api_overview_structure` for details on how objects are initialised and freed. If specified, TSK_TAKE_OWNERSHIP takes immediate ownership of the tables, regardless of error conditions. **Options** - :c:macro:`TSK_TS_INIT_BUILD_INDEXES` - :c:macro:`TSK_TS_INIT_COMPUTE_MUTATION_PARENTS` - :c:macro:`TSK_TAKE_OWNERSHIP` (applies to the table collection). @endrst @param self A pointer to an uninitialised tsk_treeseq_t object. @param tables A pointer to a tsk_table_collection_t object. @param options Allocation time options. See above for details. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_init( tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options); /** @brief Load a tree sequence from a file path. @rst Loads the data from the specified file into this tree sequence. The tree sequence is also initialised. The resources allocated must be freed using :c:func:`tsk_treeseq_free` even in error conditions. Works similarly to :c:func:`tsk_table_collection_load` please see that function's documentation for details and options. **Examples** .. 
code-block:: c int ret; tsk_treeseq_t ts; ret = tsk_treeseq_load(&ts, "data.trees", 0); if (ret != 0) { fprintf(stderr, "Load error:%s\n", tsk_strerror(ret)); exit(EXIT_FAILURE); } @endrst @param self A pointer to an uninitialised tsk_treeseq_t object @param filename A NULL terminated string containing the filename. @param options Bitwise options. See above for details. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_load(tsk_treeseq_t *self, const char *filename, tsk_flags_t options); /** @brief Load a tree sequence from a stream. @rst Loads a tree sequence from the specified file stream. The tree sequence is also initialised. The resources allocated must be freed using :c:func:`tsk_treeseq_free` even in error conditions. Works similarly to :c:func:`tsk_table_collection_loadf`; please see that function's documentation for details and options. @endrst @param self A pointer to an uninitialised tsk_treeseq_t object. @param file A FILE stream opened in an appropriate mode for reading (e.g. "r", "r+" or "w+") positioned at the beginning of a tree sequence definition. @param options Bitwise options. See above for details. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options); /** @brief Write a tree sequence to file. @rst Writes the data from this tree sequence to the specified file. If an error occurs, the file path is deleted, ensuring that only complete and well-formed files will be written. @endrst @param self A pointer to an initialised tsk_treeseq_t object. @param filename A NULL terminated string containing the filename. @param options Bitwise options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_dump( const tsk_treeseq_t *self, const char *filename, tsk_flags_t options); /** @brief Write a tree sequence to a stream. 
@rst Writes the data from this tree sequence to the specified FILE stream. Semantics are identical to :c:func:`tsk_treeseq_dump`. Please see the :ref:`sec_c_api_examples_file_streaming` section for an example of how to sequentially dump and load tree sequences from a stream. @endrst @param self A pointer to an initialised tsk_treeseq_t object. @param file A FILE stream opened in an appropriate mode for writing (e.g. "w", "a", "r+" or "w+"). @param options Bitwise options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_dumpf(const tsk_treeseq_t *self, FILE *file, tsk_flags_t options); /** @brief Copies the state of the table collection underlying this tree sequence into the specified destination table collection. @rst By default the method initialises the specified destination table collection. If the destination is already initialised, the :c:macro:`TSK_NO_INIT` option should be supplied to avoid leaking memory. @endrst @param self A pointer to a tsk_treeseq_t object. @param tables A pointer to a tsk_table_collection_t object. If the TSK_NO_INIT option is specified, this must be an initialised table collection. If not, it must be an uninitialised table collection. @param options Bitwise option flags. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_copy_tables( const tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options); /** @brief Free the internal memory for the specified tree sequence. @param self A pointer to an initialised tsk_treeseq_t object. @return Always returns 0. */ int tsk_treeseq_free(tsk_treeseq_t *self); /** @brief Print out the state of this tree sequence to the specified stream. This method is intended for debugging purposes and should not be used in production code. The format of the output should **not** be depended on and may change arbitrarily between versions. 
@param self A pointer to a tsk_treeseq_t object. @param out The stream to write the summary to. */ void tsk_treeseq_print_state(const tsk_treeseq_t *self, FILE *out); /** @brief Get the number of nodes @rst Returns the number of nodes in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of nodes. */ tsk_size_t tsk_treeseq_get_num_nodes(const tsk_treeseq_t *self); /** @brief Get the number of edges @rst Returns the number of edges in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of edges. */ tsk_size_t tsk_treeseq_get_num_edges(const tsk_treeseq_t *self); /** @brief Get the number of migrations @rst Returns the number of migrations in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of migrations. */ tsk_size_t tsk_treeseq_get_num_migrations(const tsk_treeseq_t *self); /** @brief Get the number of sites @rst Returns the number of sites in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of sites. */ tsk_size_t tsk_treeseq_get_num_sites(const tsk_treeseq_t *self); /** @brief Get the number of mutations @rst Returns the number of mutations in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of mutations. */ tsk_size_t tsk_treeseq_get_num_mutations(const tsk_treeseq_t *self); /** @brief Get the number of provenances @rst Returns the number of provenances in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of provenances. */ tsk_size_t tsk_treeseq_get_num_provenances(const tsk_treeseq_t *self); /** @brief Get the number of populations @rst Returns the number of populations in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of populations. 
*/ tsk_size_t tsk_treeseq_get_num_populations(const tsk_treeseq_t *self); /** @brief Get the number of individuals @rst Returns the number of individuals in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of individuals. */ tsk_size_t tsk_treeseq_get_num_individuals(const tsk_treeseq_t *self); /** @brief Return the number of trees in this tree sequence. @rst This is a constant time operation. @endrst @param self A pointer to a tsk_treeseq_t object. @return The number of trees in the tree sequence. */ tsk_size_t tsk_treeseq_get_num_trees(const tsk_treeseq_t *self); /** @brief Get the number of samples @rst Returns the number of nodes marked as samples in this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the number of samples. */ tsk_size_t tsk_treeseq_get_num_samples(const tsk_treeseq_t *self); /** @brief Get the top-level tree sequence metadata. @rst Returns a pointer to the metadata string, which is owned by the tree sequence and not null-terminated. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns a pointer to the metadata. */ const char *tsk_treeseq_get_metadata(const tsk_treeseq_t *self); /** @brief Get the length of top-level tree sequence metadata @rst Returns the length of the metadata string. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the length of the metadata. */ tsk_size_t tsk_treeseq_get_metadata_length(const tsk_treeseq_t *self); /** @brief Get the top-level tree sequence metadata schema. @rst Returns a pointer to the metadata schema string, which is owned by the tree sequence and not null-terminated. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns a pointer to the metadata schema. */ const char *tsk_treeseq_get_metadata_schema(const tsk_treeseq_t *self); /** @brief Get the length of the top-level tree sequence metadata schema. 
@rst Returns the length of the metadata schema string. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the length of the metadata schema. */ tsk_size_t tsk_treeseq_get_metadata_schema_length(const tsk_treeseq_t *self); /** @brief Get the time units string @rst Returns a pointer to the time units string, which is owned by the tree sequence and not null-terminated. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns a pointer to the time units. */ const char *tsk_treeseq_get_time_units(const tsk_treeseq_t *self); /** @brief Get the length of time units string @rst Returns the length of the time units string. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the length of the time units. */ tsk_size_t tsk_treeseq_get_time_units_length(const tsk_treeseq_t *self); /** @brief Get the file uuid @rst Returns a pointer to the null-terminated file uuid string, which is owned by the tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns a pointer to the null-terminated file uuid. */ const char *tsk_treeseq_get_file_uuid(const tsk_treeseq_t *self); /** @brief Get the sequence length @rst Returns the sequence length of this tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the sequence length. */ double tsk_treeseq_get_sequence_length(const tsk_treeseq_t *self); /** @brief Get the breakpoints @rst Returns an array of breakpoint locations, the array is owned by the tree sequence. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the pointer to the breakpoint array. */ const double *tsk_treeseq_get_breakpoints(const tsk_treeseq_t *self); /** @brief Get the samples @rst Returns an array of ids of sample nodes in this tree sequence. I.e. nodes that have the :c:macro:`TSK_NODE_IS_SAMPLE` flag set. The array is owned by the tree sequence and should not be modified or freed. 
@endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the pointer to the sample node id array. */ const tsk_id_t *tsk_treeseq_get_samples(const tsk_treeseq_t *self); /** @brief Get the map of node id to sample index @rst Returns the location of each node in the list of samples or :c:macro:`TSK_NULL` for nodes that are not samples. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the pointer to the array of sample indexes. */ const tsk_id_t *tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self); /** @brief Check if a node is a sample @rst Returns the sample status of a given node id. @endrst @param self A pointer to a tsk_treeseq_t object. @param u The id of the node to be checked. @return Returns true if the node is a sample. */ bool tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u); /** @brief Get the discrete genome status @rst If all the genomic locations in the tree sequence are discrete integer values then this flag will be true. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns true if all genomic locations are discrete. */ bool tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self); /** @brief Get the discrete time status @rst If all times in the tree sequence are discrete integer values then this flag will be true. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns true if all times are discrete. */ bool tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self); /** @brief Get the min time in node table and mutation table @rst The times stored in both the node and mutation tables are considered. @endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the min time of all nodes and mutations. */ double tsk_treeseq_get_min_time(const tsk_treeseq_t *self); /** @brief Get the max time in node table and mutation table @rst The times stored in both the node and mutation tables are considered. 
@endrst @param self A pointer to a tsk_treeseq_t object. @return Returns the max time of all nodes and mutations. */ double tsk_treeseq_get_max_time(const tsk_treeseq_t *self); /** @brief Get a node by its index @rst Copies a node from this tree sequence to the specified destination. @endrst @param self A pointer to a tsk_treeseq_t object. @param index The node index to copy @param node A pointer to a tsk_node_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_get_node(const tsk_treeseq_t *self, tsk_id_t index, tsk_node_t *node); /** @brief Get an edge by its index @rst Copies an edge from this tree sequence to the specified destination. @endrst @param self A pointer to a tsk_treeseq_t object. @param index The edge index to copy @param edge A pointer to a tsk_edge_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_get_edge(const tsk_treeseq_t *self, tsk_id_t index, tsk_edge_t *edge); /** @brief Get a migration by its index @rst Copies a migration from this tree sequence to the specified destination. @endrst @param self A pointer to a tsk_treeseq_t object. @param index The migration index to copy @param migration A pointer to a tsk_migration_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_get_migration( const tsk_treeseq_t *self, tsk_id_t index, tsk_migration_t *migration); /** @brief Get a site by its index @rst Copies a site from this tree sequence to the specified destination. @endrst @param self A pointer to a tsk_treeseq_t object. @param index The site index to copy @param site A pointer to a tsk_site_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_get_site(const tsk_treeseq_t *self, tsk_id_t index, tsk_site_t *site); /** @brief Get a mutation by its index @rst Copies a mutation from this tree sequence to the specified destination. @endrst @param self A pointer to a tsk_treeseq_t object. 
@param index The mutation index to copy @param mutation A pointer to a tsk_mutation_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_get_mutation( const tsk_treeseq_t *self, tsk_id_t index, tsk_mutation_t *mutation); /** @brief Get a provenance by its index @rst Copies a provenance from this tree sequence to the specified destination. @endrst @param self A pointer to a tsk_treeseq_t object. @param index The provenance index to copy @param provenance A pointer to a tsk_provenance_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_get_provenance( const tsk_treeseq_t *self, tsk_id_t index, tsk_provenance_t *provenance); /** @brief Get a population by its index @rst Copies a population from this tree sequence to the specified destination. @endrst @param self A pointer to a tsk_treeseq_t object. @param index The population index to copy @param population A pointer to a tsk_population_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_get_population( const tsk_treeseq_t *self, tsk_id_t index, tsk_population_t *population); /** @brief Get an individual by its index @rst Copies an individual from this tree sequence to the specified destination. @endrst @param self A pointer to a tsk_treeseq_t object. @param index The individual index to copy @param individual A pointer to a tsk_individual_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_get_individual( const tsk_treeseq_t *self, tsk_id_t index, tsk_individual_t *individual); /** @brief Create a simplified instance of this tree sequence @rst Copies this tree sequence to the specified destination and performs simplification. The destination tree sequence should be uninitialised. Simplification transforms the tables to remove redundancy and canonicalise tree sequence data. See the :ref:`simplification ` tutorial for more details. 
For full details and flags see :c:func:`tsk_table_collection_simplify` which performs the same operation in place. @endrst @param self A pointer to a tsk_treeseq_t object. @param samples Either NULL or an array of num_samples distinct and valid node IDs. If non-null the nodes in this array will be marked as samples in the output. If NULL, the num_samples parameter is ignored and the samples in the output will be the same as the samples in the input. This is equivalent to populating the samples array with all of the sample nodes in the input in increasing order of ID. @param num_samples The number of node IDs in the input samples array. Ignored if the samples array is NULL. @param options Simplify options; see above for the available bitwise flags. For the default behaviour, a value of 0 should be provided. @param output A pointer to an uninitialised tsk_treeseq_t object. @param node_map If not NULL, this array will be filled to define the mapping between node IDs in the table collection before and after simplification. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_simplify(const tsk_treeseq_t *self, const tsk_id_t *samples, tsk_size_t num_samples, tsk_flags_t options, tsk_treeseq_t *output, tsk_id_t *node_map); /** @brief Extends haplotypes Returns a new tree sequence in which the span covered by ancestral nodes is "extended" to regions of the genome according to the following rule: If an ancestral segment corresponding to node `n` has ancestor `p` and descendant `c` on some portion of the genome, and on an adjacent segment of genome `p` is still an ancestor of `c`, then `n` is inserted into the path from `p` to `c`. For instance, if `p` is the parent of `n` and `n` is the parent of `c`, then the span of the edges from `p` to `n` and `n` to `c` are extended, and the span of the edge from `p` to `c` is reduced. However, any edges whose child node is a sample are not modified. See Fritze et al. 
(2025): https://doi.org/10.1093/genetics/iyaf198 for more details. The method works by iterating over the genome to look for edges that can be extended in this way; the maximum number of such iterations is controlled by ``max_iter``. The `node` of certain mutations may also be remapped; to do this unambiguously we need to know mutation times. If mutation times are unknown, use `tsk_table_collection_compute_mutation_times` first. The method will not affect any tables except the edge table, or the node column in the mutation table. @rst **Options**: None currently defined. @endrst @param self A pointer to a tsk_treeseq_t object. @param max_iter The maximum number of iterations over the tree sequence. @param options Bitwise option flags. (UNUSED) @param output A pointer to an uninitialised tsk_treeseq_t object. @return Return 0 on success or a negative value on failure. */ int tsk_treeseq_extend_haplotypes( const tsk_treeseq_t *self, int max_iter, tsk_flags_t options, tsk_treeseq_t *output); /** @} */ /* Undocumented. NOTE(review): judging by the signature alone, this presumably writes to `output` a version of `self` with edges split at `time`, the new nodes taking the given `flags`, `population` and `metadata` -- confirm against trees.c before documenting publicly. */ int tsk_treeseq_split_edges(const tsk_treeseq_t *self, double time, tsk_flags_t flags, tsk_id_t population, const char *metadata, tsk_size_t metadata_length, tsk_flags_t options, tsk_treeseq_t *output); /* Undocumented. NOTE(review): presumably reports whether the underlying table collection holds a reference sequence -- confirm against trees.c. */ bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self); /** @brief Decode full-length alignments for specified nodes over an interval. @rst Fills a caller-provided buffer with per-node sequence alignments for the interval ``[left, right)``. Each row is exactly ``L = right - left`` bytes with no trailing terminator, and rows are tightly packed in row-major order in the output buffer. The output at non-site positions comes from the provided ``ref_seq`` slice (``ref_seq[left:right]``); per-site alleles are overlaid onto this for each node. If the :c:macro:`TSK_ISOLATED_NOT_MISSING` option is not set, nodes that are isolated (no parent and no children) within a tree interval in ``[left, right)`` are rendered as the ``missing_data_character`` for that interval. 
At site positions, decoded genotypes override any previous value; if a genotype is missing (``TSK_MISSING_DATA``), the ``missing_data_character`` is overlaid onto the reference base. Requirements and validation: - The tree sequence must have a discrete genome. - ``left`` and ``right`` must be integers with ``0 <= left < right <= sequence_length``. - ``ref_seq`` must be non-NULL and ``ref_seq_length == sequence_length``. - Each allele at a site must be exactly one byte; alleles equal to ``missing_data_character`` are not permitted. @endrst @param self A pointer to a :c:type:`tsk_treeseq_t` object. @param ref_seq Pointer to a reference sequence buffer of length ``ref_seq_length``. @param ref_seq_length The total length of ``ref_seq``; must equal the tree sequence length. @param nodes Array of node IDs to decode (may include non-samples). @param num_nodes The number of nodes in ``nodes`` and rows in the output. @param left The inclusive-left genomic coordinate of the output interval. @param right The exclusive-right genomic coordinate of the output interval. @param missing_data_character The byte to use for missing data. @param alignments_out Output buffer of size at least ``num_nodes * (right - left)``. @param options Bitwise option flags; supports :c:macro:`TSK_ISOLATED_NOT_MISSING`. @return Return 0 on success or a negative value on failure. 
*/ int tsk_treeseq_decode_alignments(const tsk_treeseq_t *self, const char *ref_seq, tsk_size_t ref_seq_length, const tsk_id_t *nodes, tsk_size_t num_nodes, double left, double right, char missing_data_character, char *alignments_out, tsk_flags_t options); /* NOTE(review): the declarations from here on carry no API documentation in this header. Their semantics cannot be established from the prototypes alone; consult trees.c / stats.c before relying on anything beyond the signatures. */ int tsk_treeseq_get_individuals_population(const tsk_treeseq_t *self, tsk_id_t *output); int tsk_treeseq_get_individuals_time(const tsk_treeseq_t *self, double *output); int tsk_treeseq_kc_distance(const tsk_treeseq_t *self, const tsk_treeseq_t *other, double lambda_, double *result); int tsk_treeseq_genealogical_nearest_neighbours(const tsk_treeseq_t *self, const tsk_id_t *focal, tsk_size_t num_focal, const tsk_id_t *const *reference_sets, const tsk_size_t *reference_set_size, tsk_size_t num_reference_sets, tsk_flags_t options, double *ret_array); int tsk_treeseq_mean_descendants(const tsk_treeseq_t *self, const tsk_id_t *const *reference_sets, const tsk_size_t *reference_set_size, tsk_size_t num_reference_sets, tsk_flags_t options, double *ret_array); /* Callback type taken by tsk_treeseq_general_stat (as `f`) and tsk_treeseq_two_locus_count_stat (as `f`): receives a `state` array with its dimension `state_dim`, a `result` array with its dimension `result_dim`, and an opaque `params` pointer. NOTE(review): presumably returns 0 on success and `params` is presumably the caller's `f_params` -- confirm in stats.c. */ typedef int general_stat_func_t(tsk_size_t state_dim, const double *state, tsk_size_t result_dim, double *result, void *params); int tsk_treeseq_general_stat(const tsk_treeseq_t *self, tsk_size_t K, const double *W, tsk_size_t M, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); /* Callback type taken by tsk_treeseq_two_locus_count_stat as `norm_f`. */ typedef int norm_func_t(tsk_size_t result_dim, const double *hap_weights, tsk_size_t n_a, tsk_size_t n_b, double *result, void *params); int tsk_treeseq_two_locus_count_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t result_dim, const tsk_id_t *set_indexes, general_stat_func_t *f, norm_func_t *norm_f, tsk_size_t out_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t out_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); /* One way weighted stats */ typedef int 
one_way_weighted_method(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_trait_covariance(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_trait_correlation(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); /* One way weighted stats with covariates */ typedef int one_way_covariates_method(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_covariates, const double *covariates, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_trait_linear_model(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_covariates, const double *covariates, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); /* Two way weighted stats with covariates */ typedef int two_way_weighted_method(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, tsk_flags_t options); int tsk_treeseq_genetic_relatedness_weighted(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, tsk_flags_t options); /* One way weighted stats with vector output */ typedef int weighted_vector_method(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_size_t num_focal_nodes, const tsk_id_t *focal_nodes, double *result, tsk_flags_t options); int 
tsk_treeseq_genetic_relatedness_vector(const tsk_treeseq_t *self, tsk_size_t num_weights, const double *weights, tsk_size_t num_windows, const double *windows, tsk_size_t num_focal_nodes, const tsk_id_t *focal_nodes, double *result, tsk_flags_t options); /* One way sample set stats */ typedef int one_way_sample_stat_method(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_diversity(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_segregating_sites(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_Y1(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_size_t num_time_windows, const double *time_windows, tsk_flags_t options, double *result); typedef int general_sample_stat_method(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_indexes, const tsk_id_t *indexes, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); typedef int two_locus_count_stat_method(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t 
num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_D(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_D2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_r2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_D_prime(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_r(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_Dz(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, 
const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_pi2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_D2_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_Dz_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_pi2_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); /* Function type with the same signature as the index-tuple two-locus statistics declared below (tsk_treeseq_D2_ij, tsk_treeseq_D2_ij_unbiased and tsk_treeseq_r2_ij). */ typedef int k_way_two_locus_count_stat_method(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); /* Two way sample set stats */ int 
tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_Y2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_genetic_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_D2_ij(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_D2_ij_unbiased(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); int tsk_treeseq_r2_ij(const tsk_treeseq_t *self, 
tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_rows, const tsk_id_t *row_sites, const double *row_positions, tsk_size_t num_cols, const tsk_id_t *col_sites, const double *col_positions, tsk_flags_t options, double *result); /* Three way sample set stats */ int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_f3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); /* Four way sample set stats */ int tsk_treeseq_f4(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); int tsk_treeseq_divergence_matrix(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, tsk_flags_t options, double *result); /* Coalescence rates */ /* Function type of the summary_func argument accepted by tsk_treeseq_pair_coalescence_stat. */ typedef int pair_coalescence_stat_func_t(tsk_size_t input_dim, const double *atoms, const double *weights, tsk_size_t result_dim, double *result, void *params); int tsk_treeseq_pair_coalescence_stat(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes, tsk_size_t num_windows, const double *windows, tsk_size_t num_bins, const tsk_id_t *node_bin_map, 
pair_coalescence_stat_func_t *summary_func, tsk_size_t summary_func_dim, void *summary_func_args, tsk_flags_t options, double *result); int tsk_treeseq_pair_coalescence_counts(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes, tsk_size_t num_windows, const double *windows, tsk_size_t num_bins, const tsk_id_t *node_bin_map, tsk_flags_t options, double *result); int tsk_treeseq_pair_coalescence_quantiles(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes, tsk_size_t num_windows, const double *windows, tsk_size_t num_bins, const tsk_id_t *node_bin_map, tsk_size_t num_quantiles, double *quantiles, tsk_flags_t options, double *result); int tsk_treeseq_pair_coalescence_rates(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_set_indexes, const tsk_id_t *set_indexes, tsk_size_t num_windows, const double *windows, tsk_size_t num_time_windows, const tsk_id_t *node_time_window, double *time_windows, tsk_flags_t options, double *result); /****************************************************************************/ /* Tree */ /****************************************************************************/ /** @defgroup TREE_API_LIFECYCLE_GROUP Tree lifecycle @{ */ /** @brief Initialises the tree by allocating internal memory and associating with the specified tree sequence. @rst This must be called before any operations are performed on the tree. The specified tree sequence object must be initialised, and must be valid for the full lifetime of this tree. See the :ref:`sec_c_api_overview_structure` for details on how objects are initialised and freed. The ``options`` parameter is provided to support future expansions of the API. 
A number of undocumented internal features are controlled via this parameter, and it **must** be set to 0 to ensure that operations work as expected and for compatibility with future versions of tskit. @endrst @param self A pointer to an uninitialised tsk_tree_t object. @param tree_sequence A pointer to an initialised tsk_treeseq_t object. @param options Allocation time options. Must be 0, or behaviour is undefined. @return Return 0 on success or a negative value on failure. */ int tsk_tree_init( tsk_tree_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options); /** @brief Free the internal memory for the specified tree. @param self A pointer to an initialised tsk_tree_t object. @return Always returns 0. */ int tsk_tree_free(tsk_tree_t *self); /** @brief Copies the state of this tree into the specified destination. @rst By default (``options`` = 0) the method initialises the specified destination tree by calling :c:func:`tsk_tree_init`. If the destination is already initialised, the :c:macro:`TSK_NO_INIT` option should be supplied to avoid leaking memory. If :c:macro:`TSK_NO_INIT` is supplied and the tree sequence associated with the ``dest`` tree is not equal to the tree sequence associated with ``self``, an error is raised. The destination tree will keep a reference to the tree sequence object associated with the source tree, and this tree sequence must be valid for the full lifetime of the destination tree. **Options** - :c:macro:`TSK_NO_INIT` If :c:macro:`TSK_NO_INIT` is not specified, options for :c:func:`tsk_tree_init` can be provided and will be passed on. @endrst @param self A pointer to an initialised tsk_tree_t object. @param dest A pointer to a tsk_tree_t object. If the TSK_NO_INIT option is specified, this must be an initialised tree. If not, it must be an uninitialised tree. @param options Copy and allocation time options. See the notes above for details. @return Return 0 on success or a negative value on failure. 
*/ int tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options); /** @} */ /** @defgroup TREE_API_SEEKING_GROUP Seeking along the sequence @{ */ /** @brief Option to seek by skipping to the target tree, adding and removing as few edges as possible. If not specified, a linear time algorithm is used instead. @ingroup TREE_API_SEEKING_GROUP */ #define TSK_SEEK_SKIP (1 << 0) /** @brief Seek to the first tree in the sequence. @rst Set the state of this tree to reflect the first tree in the parent tree sequence. @endrst @param self A pointer to an initialised tsk_tree_t object. @return Return TSK_TREE_OK on success; or a negative value if an error occurs. */ int tsk_tree_first(tsk_tree_t *self); /** @brief Seek to the last tree in the sequence. @rst Set the state of this tree to reflect the last tree in the parent tree sequence. @endrst @param self A pointer to an initialised tsk_tree_t object. @return Return TSK_TREE_OK on success; or a negative value if an error occurs. */ int tsk_tree_last(tsk_tree_t *self); /** @brief Seek to the next tree in the sequence. @rst Set the state of this tree to reflect the next tree in the parent tree sequence. If the index of the current tree is ``j``, then after this operation the index will be ``j + 1``. Calling :c:func:`tsk_tree_next` on a tree in the :ref:`null state` is equivalent to calling :c:func:`tsk_tree_first`. Calling :c:func:`tsk_tree_next` on the last tree in the sequence will transform it into the :ref:`null state` (equivalent to calling :c:func:`tsk_tree_clear`). Please see the :ref:`sec_c_api_examples_tree_iteration` examples for recommended usage. @endrst @param self A pointer to an initialised tsk_tree_t object. @return Return TSK_TREE_OK on successfully transforming to a non-null tree; 0 on successfully transforming into the null tree; or a negative value if an error occurs. */ int tsk_tree_next(tsk_tree_t *self); /** @brief Seek to the previous tree in the sequence. 
@rst Set the state of this tree to reflect the previous tree in the parent tree sequence. If the index of the current tree is ``j``, then after this operation the index will be ``j - 1``. Calling :c:func:`tsk_tree_prev` on a tree in the :ref:`null state` is equivalent to calling :c:func:`tsk_tree_last`. Calling :c:func:`tsk_tree_prev` on the first tree in the sequence will transform it into the :ref:`null state` (equivalent to calling :c:func:`tsk_tree_clear`). Please see the :ref:`sec_c_api_examples_tree_iteration` examples for recommended usage. @endrst @param self A pointer to an initialised tsk_tree_t object. @return Return TSK_TREE_OK on successfully transforming to a non-null tree; 0 on successfully transforming into the null tree; or a negative value if an error occurs. */ int tsk_tree_prev(tsk_tree_t *self); /** @brief Set the tree into the null state. @rst Transform this tree into the :ref:`null state`. @endrst @param self A pointer to an initialised tsk_tree_t object. @return Return 0 on success or a negative value on failure. */ int tsk_tree_clear(tsk_tree_t *self); /** @brief Seek to a particular position on the genome. @rst Set the state of this tree to reflect the tree in parent tree sequence covering the specified ``position``. That is, on success we will have ``tree.interval.left <= position`` and we will have ``position < tree.interval.right``. Seeking to a position currently covered by the tree is a constant time operation. Seeking to a position from a non-null tree uses a linear time algorithm by default, unless the option :c:macro:`TSK_SEEK_SKIP` is specified. In this case, a faster algorithm is employed which skips to the target tree by removing and adding the minimal number of edges possible. However, this approach does not guarantee that edges are inserted and removed in time-sorted order. .. warning:: Using the :c:macro:`TSK_SEEK_SKIP` option may lead to edges not being inserted or removed in time-sorted order. 
@endrst @param self A pointer to an initialised tsk_tree_t object. @param position The position in genome coordinates @param options Seek options. See the notes above for details. @return Return 0 on success or a negative value on failure. */ int tsk_tree_seek(tsk_tree_t *self, double position, tsk_flags_t options); /** @brief Seek to a specific tree in a tree sequence. @rst Set the state of this tree to reflect the tree in the parent tree sequence whose index is ``0 <= tree < num_trees``. @endrst @param self A pointer to an initialised tsk_tree_t object. @param tree The target tree index. @param options Seek options. Currently unused. Set to 0 for compatibility with future versions of tskit. @return Return 0 on success or a negative value on failure. */ int tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options); /** @} */ /** @defgroup TREE_API_TREE_QUERY_GROUP Tree Queries @{ */ /** @brief Returns the number of roots in this tree. @rst See the :ref:`sec_data_model_tree_roots` section for more information on how the roots of a tree are defined. @endrst @param self A pointer to an initialised tsk_tree_t object. @return Returns the number of roots in this tree. */ tsk_size_t tsk_tree_get_num_roots(const tsk_tree_t *self); /** @brief Returns the leftmost root in this tree. @rst See the :ref:`sec_data_model_tree_roots` section for more information on how the roots of a tree are defined. This function is equivalent to ``tree.left_child[tree.virtual_root]``. @endrst @param self A pointer to an initialised tsk_tree_t object. @return Returns the leftmost root in the tree. */ tsk_id_t tsk_tree_get_left_root(const tsk_tree_t *self); /** @brief Returns the rightmost root in this tree. @rst See the :ref:`sec_data_model_tree_roots` section for more information on how the roots of a tree are defined. This function is equivalent to ``tree.right_child[tree.virtual_root]``. @endrst @param self A pointer to an initialised tsk_tree_t object. 
@return Returns the rightmost root in the tree. */ tsk_id_t tsk_tree_get_right_root(const tsk_tree_t *self); /** @brief Get the list of sites for this tree. @rst Gets the list of :c:data:`tsk_site_t` objects in the parent tree sequence for which the position lies within this tree's genomic interval. The memory pointed to by the ``sites`` parameter is managed by the ``tsk_tree_t`` object and must not be altered or freed by client code. .. code-block:: c static void print_sites(const tsk_tree_t *tree) { int ret; tsk_size_t j, num_sites; const tsk_site_t *sites; ret = tsk_tree_get_sites(tree, &sites, &num_sites); check_tsk_error(ret); for (j = 0; j < num_sites; j++) { printf("position = %f\n", sites[j].position); } } This is a constant time operation. @endrst @param self A pointer to a tsk_tree_t object. @param sites The destination pointer for the list of sites. @param sites_length A pointer to a tsk_size_t value in which the number of sites is stored. @return 0 on success or a negative value on failure. */ int tsk_tree_get_sites( const tsk_tree_t *self, const tsk_site_t **sites, tsk_size_t *sites_length); /** @brief Return an upper bound on the number of nodes reachable from the roots of this tree. @rst This function provides an upper bound on the number of nodes that can be reached in tree traversals, and is intended to be used for memory allocation purposes. If ``num_nodes`` is the number of nodes visited in a tree traversal from the :ref:`virtual root` (e.g., ``tsk_tree_preorder_from(tree, tree->virtual_root, nodes, &num_nodes)``), the bound ``N`` returned here is guaranteed to be greater than or equal to ``num_nodes``. .. warning:: The precise value returned is not defined and should not be depended on, as it may change from version-to-version. @endrst @param self A pointer to a tsk_tree_t object. @return An upper bound on the number of nodes reachable from the roots of this tree, or zero if this tree has not been initialised. 
*/ tsk_size_t tsk_tree_get_size_bound(const tsk_tree_t *self); /** @brief Print out the state of this tree to the specified stream. This method is intended for debugging purposes and should not be used in production code. The format of the output should **not** be depended on and may change arbitrarily between versions. @param self A pointer to a tsk_tree_t object. @param out The stream to write the summary to. */ void tsk_tree_print_state(const tsk_tree_t *self, FILE *out); /** @} */ /** @defgroup TREE_API_NODE_QUERY_GROUP Node Queries @{ */ /** @brief Returns the parent of the specified node. @rst Equivalent to ``tree.parent[u]`` with bounds checking for the node u. Performance sensitive code which can guarantee that the node u is valid should use the direct array access in preference to this method. @endrst @param self A pointer to a tsk_tree_t object. @param u The tree node. @param parent A tsk_id_t pointer to store the returned parent node. @return 0 on success or a negative value on failure. */ int tsk_tree_get_parent(const tsk_tree_t *self, tsk_id_t u, tsk_id_t *parent); /** @brief Returns the time of the specified node. @rst Equivalent to ``tables->nodes.time[u]`` with bounds checking for the node u. Performance sensitive code which can guarantee that the node u is valid should use the direct array access in preference to this method, for example: .. code-block:: c static void print_times(const tsk_tree_t *tree) { int ret; tsk_size_t num_nodes, j; const double *node_time = tree->tree_sequence->tables->nodes.time; tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); if (nodes == NULL) { errx(EXIT_FAILURE, "Out of memory"); } ret = tsk_tree_preorder(tree, nodes, &num_nodes); check_tsk_error(ret); for (j = 0; j < num_nodes; j++) { printf("time = %f\n", node_time[nodes[j]]); } free(nodes); } @endrst @param self A pointer to a tsk_tree_t object. @param u The tree node. @param ret_time A double pointer to store the returned node time. 
@return 0 on success or a negative value on failure. */ int tsk_tree_get_time(const tsk_tree_t *self, tsk_id_t u, double *ret_time); /** @brief Return number of nodes on the path from the specified node to root. @rst Return the number of nodes on the path from u to root, not including u. The depth of a root is therefore zero. As a special case, the depth of the :ref:`virtual root ` is defined as -1. @endrst @param self A pointer to a tsk_tree_t object. @param u The tree node. @param ret_depth An int pointer to store the returned node depth. @return 0 on success or a negative value on failure. */ int tsk_tree_get_depth(const tsk_tree_t *self, tsk_id_t u, int *ret_depth); /** @brief Return the length of the branch ancestral to the specified node. @rst Return the length of the branch ancestral to the specified node. Branch length is defined as the difference between the time of a node and its parent. The branch length of a root is zero. @endrst @param self A pointer to a tsk_tree_t object. @param u The tree node. @param ret_branch_length A double pointer to store the returned branch length. @return 0 on success or a negative value on failure. */ int tsk_tree_get_branch_length( const tsk_tree_t *self, tsk_id_t u, double *ret_branch_length); /** @brief Computes the sum of the lengths of all branches reachable from the specified node, or from all roots if ``u=TSK_NULL``. @rst Return the total branch length in a particular subtree or of the entire tree. If the specified node is :c:macro:`TSK_NULL` (or the :ref:`virtual root`) the sum of the lengths of all branches reachable from roots is returned. Branch length is defined as the difference between the time of a node and its parent. The branch length of a root is zero. Note that if the specified node is internal its branch length is *not* included, so that, e.g., the total branch length of a leaf node is zero. @endrst @param self A pointer to a tsk_tree_t object. 
@param u The root of the subtree of interest, or ``TSK_NULL`` to return the total branch length of the tree. @param ret_tbl A double pointer to store the returned total branch length. @return 0 on success or a negative value on failure. */ int tsk_tree_get_total_branch_length( const tsk_tree_t *self, tsk_id_t u, double *ret_tbl); /** @brief Counts the number of samples in the subtree rooted at a node. @rst Returns the number of samples descending from a particular node, including the node itself. This is a constant time operation. @endrst @param self A pointer to a tsk_tree_t object. @param u The tree node. @param ret_num_samples A tsk_size_t pointer to store the returned number of samples. @return 0 on success or a negative value on failure. */ int tsk_tree_get_num_samples( const tsk_tree_t *self, tsk_id_t u, tsk_size_t *ret_num_samples); /** @brief Compute the most recent common ancestor of two nodes. @rst If two nodes do not share a common ancestor in the current tree, the MRCA node is :c:macro:`TSK_NULL`. @endrst @param self A pointer to a tsk_tree_t object. @param u A tree node. @param v A tree node. @param mrca A tsk_id_t pointer to store the returned most recent common ancestor node. @return 0 on success or a negative value on failure. */ int tsk_tree_get_mrca(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v, tsk_id_t *mrca); /** @brief Returns true if u is a descendant of v. @rst Returns true if u and v are both valid nodes in the tree sequence and v lies on the path from u to root, and false otherwise. Any node is a descendant of itself. @endrst @param self A pointer to a tsk_tree_t object. @param u The descendant node. @param v The ancestral node. @return true if u is a descendant of v, and false otherwise. */ bool tsk_tree_is_descendant(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v); /** @} */ /** @defgroup TREE_API_TRAVERSAL_GROUP Traversal orders. @{ */ /** @brief Fill an array with the nodes of this tree in preorder. 
@rst Populate an array with the nodes in this tree in preorder. The array must be pre-allocated and be sufficiently large to hold the array of nodes visited. The recommended approach is to use the :c:func:`tsk_tree_get_size_bound` function, as in the following example: .. code-block:: c static void print_preorder(tsk_tree_t *tree) { int ret; tsk_size_t num_nodes, j; tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); if (nodes == NULL) { errx(EXIT_FAILURE, "Out of memory"); } ret = tsk_tree_preorder(tree, nodes, &num_nodes); check_tsk_error(ret); for (j = 0; j < num_nodes; j++) { printf("Visit preorder %lld\n", (long long) nodes[j]); } free(nodes); } .. seealso:: See the :ref:`sec_c_api_examples_tree_traversals` section for more examples. @endrst @param self A pointer to a tsk_tree_t object. @param nodes The tsk_id_t array to store nodes in. See notes above for details. @param num_nodes A pointer to a tsk_size_t value where we store the number of nodes in the traversal. @return 0 on success or a negative value on failure. */ int tsk_tree_preorder(const tsk_tree_t *self, tsk_id_t *nodes, tsk_size_t *num_nodes); /** @brief Fill an array with the nodes of this tree starting from a particular node. @rst As for :c:func:`tsk_tree_preorder` but starting the traversal at a particular node (which will be the first node in the traversal list). The :ref:`virtual root` is a valid input for this function and will be treated like any other tree node. The value ``-1`` is a special case, in which we visit all nodes reachable from the roots, and equivalent to calling :c:func:`tsk_tree_preorder`. See :c:func:`tsk_tree_preorder` for details of the requirements for the ``nodes`` array. @endrst @param self A pointer to a tsk_tree_t object. @param root The root of the subtree to traverse, or -1 to visit all nodes. @param nodes The tsk_id_t array to store nodes in. @param num_nodes A pointer to a tsk_size_t value where we store the number of nodes in the traversal. 
@return 0 on success or a negative value on failure. */ int tsk_tree_preorder_from( const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); /** @brief Fill an array with the nodes of this tree in postorder. @rst Populate an array with the nodes in this tree in postorder. The array must be pre-allocated and be sufficiently large to hold the array of nodes visited. The recommended approach is to use the :c:func:`tsk_tree_get_size_bound` function, as in the following example: .. code-block:: c static void print_postorder(tsk_tree_t *tree) { int ret; tsk_size_t num_nodes, j; tsk_id_t *nodes = malloc(tsk_tree_get_size_bound(tree) * sizeof(*nodes)); if (nodes == NULL) { errx(EXIT_FAILURE, "Out of memory"); } ret = tsk_tree_postorder(tree, nodes, &num_nodes); check_tsk_error(ret); for (j = 0; j < num_nodes; j++) { printf("Visit postorder %lld\n", (long long) nodes[j]); } free(nodes); } .. seealso:: See the :ref:`sec_c_api_examples_tree_traversals` section for more examples. @endrst @param self A pointer to a tsk_tree_t object. @param nodes The tsk_id_t array to store nodes in. See notes above for details. @param num_nodes A pointer to a tsk_size_t value where we store the number of nodes in the traversal. @return 0 on success or a negative value on failure. */ int tsk_tree_postorder(const tsk_tree_t *self, tsk_id_t *nodes, tsk_size_t *num_nodes); /** @brief Fill an array with the nodes of this tree starting from a particular node. @rst As for :c:func:`tsk_tree_postorder` but starting the traversal at a particular node (which will be the last node in the traversal list). The :ref:`virtual root` is a valid input for this function and will be treated like any other tree node. The value ``-1`` is a special case, in which we visit all nodes reachable from the roots, and equivalent to calling :c:func:`tsk_tree_postorder`. See :c:func:`tsk_tree_postorder` for details of the requirements for the ``nodes`` array. 
@endrst @param self A pointer to a tsk_tree_t object. @param root The root of the subtree to traverse, or -1 to visit all nodes. @param nodes The tsk_id_t array to store nodes in. See :c:func:`tsk_tree_postorder` for more details. @param num_nodes A pointer to a tsk_size_t value where we store the number of nodes in the traversal. @return 0 on success or a negative value on failure. */ int tsk_tree_postorder_from( const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); /** @} */ /* Undocumented for now */ int tsk_tree_preorder_samples_from( const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); int tsk_tree_set_root_threshold(tsk_tree_t *self, tsk_size_t root_threshold); tsk_size_t tsk_tree_get_root_threshold(const tsk_tree_t *self); bool tsk_tree_has_sample_counts(const tsk_tree_t *self); bool tsk_tree_has_sample_lists(const tsk_tree_t *self); int tsk_tree_get_num_tracked_samples( const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_tracked_samples); int tsk_tree_set_tracked_samples( tsk_tree_t *self, tsk_size_t num_tracked_samples, const tsk_id_t *tracked_samples); int tsk_tree_track_descendant_samples(tsk_tree_t *self, tsk_id_t node); /* A (node, parent, state) record; tsk_tree_map_mutations hands these back through its transitions argument. */ typedef struct { tsk_id_t node; tsk_id_t parent; int32_t state; } tsk_state_transition_t; int tsk_tree_map_mutations(tsk_tree_t *self, int32_t *genotypes, double *cost_matrix, tsk_flags_t options, int32_t *ancestral_state, tsk_size_t *num_transitions, tsk_state_transition_t **transitions); int tsk_tree_kc_distance( const tsk_tree_t *self, const tsk_tree_t *other, double lambda, double *result); /* Don't document these balance metrics for now so it doesn't get in the way of * C API 1.0, but should be straightforward to document based on Python docs. 
*/ int tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result); int tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result); int tsk_tree_b1_index(const tsk_tree_t *self, double *result); /* NOTE: if we document this as part of the C API we'll have to be more careful * about the error behaviour on bad log bases. At the moment we're just returning * the resulting value which can be nan, inf etc, but some surprising results * happen like a base 0 seems to return a finite value. */ int tsk_tree_b2_index(const tsk_tree_t *self, double base, double *result); int tsk_tree_num_lineages(const tsk_tree_t *self, double t, tsk_size_t *result); /* Things to consider removing: */ /* This is redundant, really */ bool tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u); /* Not terribly useful, since the definition is * return (self->tree_sequence == other->tree_sequence) && (self->index == other->index) * Remove? */ bool tsk_tree_equals(const tsk_tree_t *self, const tsk_tree_t *other); int tsk_tree_position_init( tsk_tree_position_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options); int tsk_tree_position_free(tsk_tree_position_t *self); int tsk_tree_position_print_state(const tsk_tree_position_t *self, FILE *out); bool tsk_tree_position_next(tsk_tree_position_t *self); bool tsk_tree_position_prev(tsk_tree_position_t *self); int tsk_tree_position_seek_forward(tsk_tree_position_t *self, tsk_id_t index); int tsk_tree_position_seek_backward(tsk_tree_position_t *self, tsk_id_t index); #ifdef __cplusplus } #endif #endif ================================================ FILE: c/tskit.h ================================================ /* * MIT License * * Copyright (c) 2019-2024 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /** * @file tskit.h * @brief Tskit API. */ #ifndef __TSKIT_H__ #define __TSKIT_H__ #include #include #include #include #include #include #endif ================================================ FILE: codecov.yml ================================================ ignore: - "c/tests/" - "c/subprojects/**/*" codecov: require_ci_to_pass: false comment: layout: "header, diff, flags, components" # show component info in the PR comment component_management: individual_components: - component_id: python_code name: Python API paths: - python/tskit/*.py - component_id: python_c_code name: Python C interface paths: - python/_tskitmodule.c - python/lwt_interface/tskit_lwt_interface.h - component_id: c_code name: C library paths: - c/tskit ================================================ FILE: docs/.gitignore ================================================ _build doxygen/xml ================================================ FILE: docs/Makefile ================================================ BUILDDIR = _build DOXYGEN_XML = doxygen/xml all: ${DOXYGEN_XML} ./build.sh ${DOXYGEN_XML}: ../c/tskit/*.h cd doxygen && doxygen clean: rm -fR $(BUILDDIR) $(DOXYGEN_XML) 
================================================ FILE: docs/_config.yml ================================================ # Book settings # Learn more at https://jupyterbook.org/customize/config.html title: Tskit manual author: Tskit Developers copyright: "2022" only_build_toc_files: true logo: logo.svg favicon: favicon.ico execute: execute_notebooks: cache timeout: 120 launch_buttons: binderhub_url: "" repository: url: https://github.com/tskit-dev/tskit branch: main path_to_book: docs html: use_issues_button: true use_repository_button: true use_edit_page_button: true sphinx: extra_extensions: - sphinx_copybutton - breathe - sphinx.ext.autodoc - sphinx_autodoc_typehints - sphinx.ext.autosummary - sphinx.ext.todo - sphinx.ext.viewcode - sphinx.ext.intersphinx - sphinx_issues - sphinxarg.ext - IPython.sphinxext.ipython_console_highlighting #- sphinxcontrib.prettyspecialmethods config: html_theme: sphinx_book_theme html_theme_options: pygments_dark_style: monokai navigation_with_keys: false logo: text: "Version __PKG_VERSION__" repository_url: https://github.com/tskit-dev/tskit repository_branch: main path_to_docs: docs use_repository_button: true use_edit_page_button: true use_issues_button: true pygments_style: monokai myst_enable_extensions: - colon_fence - deflist - dollarmath - substitution issues_github_path: tskit-dev/tskit todo_include_todos: true intersphinx_mapping: python: ["https://docs.python.org/3/", null] tutorials: ["https://tskit.dev/tutorials/", null] stdpopsim: ["https://stdpopsim.readthedocs.io/en/stable", null] msprime: ["https://tskit.dev/msprime/docs/stable/", null] numpy: ["https://numpy.org/doc/stable/", null] breathe_projects: {"tskit": "doxygen/xml"} breathe_default_project: "tskit" breathe_domain_by_extension: {"h": "c"} breathe_show_define_initializer: True # Note we have to use the regex version here because of # https://github.com/sphinx-doc/sphinx/issues/9748 nitpick_ignore_regex: [ ["c:identifier", "uint8_t"], ["c:identifier", 
"int32_t"], ["c:identifier", "uint32_t"], ["c:identifier", "uint64_t"], ["c:identifier", "FILE"], ["c:identifier", "bool"], # This is for the anonymous interval struct embedded in the tsk_tree_t. ["c:identifier", "tsk_tree_t.@4"], ["c:type", "int32_t"], ["c:type", "uint32_t"], ["c:type", "uint64_t"], ["c:type", "bool"], # TODO these have been triaged here to make the docs compile, but we should # sort them out properly. https://github.com/tskit-dev/tskit/issues/336 ["py:class", "array_like"], ["py:class", "row-like"], ["py:class", "array-like"], ["py:class", "dtype=np.uint32"], ["py:class", "dtype=np.uint32."], ["py:class", "dtype=np.int32"], ["py:class", "dtype=np.int8"], ["py:class", "dtype=np.float64"], ["py:class", "dtype=np.int64"], ] # Added to allow "bool" be used as a :ctype: - this list has to be # manually specifed in order to remove "bool" from it. c_extra_keywords: [ "alignas", "alignof", "complex", "imaginary", "noreturn", "static_assert", "thread_local" ] autodoc_member_order: bysource # Without this option, autodoc tries to put links for all return types # in terms of the fully-qualified classnames which we don't want, and also # leads to broken links and nitpick failures. So, until we tackle # typehints fully, this is the simplest approach. autodoc_typehints: none ================================================ FILE: docs/_static/README ================================================ Placeholder file to keep git happy. ================================================ FILE: docs/_static/bespoke.css ================================================ /* When a code cell outputs tskit tables in plain text, widen the tab size so column contents line up. 
Invoke this by adding :tags:["output-wide-tabs"] to the cell */ .tag_output-wide-tabs .cell_output pre {tab-size: 16} ================================================ FILE: docs/_toc.yml ================================================ format: jb-book root: introduction parts: - caption: Getting started chapters: - file: installation - file: quickstart - caption: Concepts chapters: - file: glossary - file: data-model - file: metadata - file: provenance - caption: Analysis chapters: - file: stats - file: topological-analysis - file: ibd - file: export - caption: Interfaces chapters: - file: python-api - file: numba - file: c-api - file: cli - file: file-formats - caption: For developers chapters: - file: development - file: changelogs - caption: Miscellaneous chapters: - file: citation ================================================ FILE: docs/build.sh ================================================
#!/bin/bash
# jupyter-book build doesn't have an option to automatically show the
# saved reports, which makes it difficult to debug the reasons for
# build failures in CI. This is a simple wrapper to handle that.

REPORTDIR=_build/html/reports

uv run --project=../python --group docs jupyter-book build . -vnW --keep-going
# Capture the build status immediately, before any other command overwrites $?
RETVAL=$?
if [ $RETVAL -ne 0 ]; then
    if [ -e $REPORTDIR ]; then
        echo "Error occured; showing saved reports"
        cat $REPORTDIR/*
    fi
else
    # Clear out any old reports
    rm -f $REPORTDIR/*
fi
# Propagate the build's exit status to the caller
exit $RETVAL
================================================ FILE: docs/c-api.rst ================================================ .. _sec_c_api: ===== C API ===== This is the documentation for the ``tskit`` C API, a low-level library for manipulating and processing :ref:`tree sequence data `. The library is written using the C99 standard and is fully thread safe. Tskit uses `kastore `_ to define a simple storage format for the tree sequence data. To see the API in action, please see :ref:`sec_c_api_examples` section.
******** Overview ******** -------------------- Do I need the C API? -------------------- The ``tskit`` C API is generally useful in the following situations: - You want to use the ``tskit`` API in a larger C/C++ application (e.g., in order to output data in the ``.trees`` format); - You need to perform lots of tree traversals/loops etc. to analyse some data that is in tree sequence form. For high level operations that are not performance sensitive, the :ref:`sec_python_api` is generally more useful. Python is *much* more convenient than C, and since the ``tskit`` Python module is essentially a wrapper for this C library, there's often no real performance penalty for using it. ------------------------------- Differences with the Python API ------------------------------- Much of the explanatory material (for example tutorials) about the Python API applies to the C-equivalent methods as the Python API wraps this API. The main area of difference is that, unlike the Python API, the C API doesn't do any decoding, encoding or schema validation of :ref:`sec_metadata` fields, instead only handling the byte string representation of the metadata. Metadata is therefore never used directly by any tskit C API method, just stored. ---------------------- API stability contract ---------------------- Since the C API 1.0 release we pledge to make **no** breaking changes to the documented API in subsequent releases in the 1.0 series. What this means is that any code that compiles under the 1.0 release should also compile without changes in subsequent 1.x releases. We will not change the semantics of documented functions, unless it is to fix clearly buggy behaviour. We will not change the values of macro constants. Undocumented functions do not have this guarantee, and may be changed arbitrarily between releases. ..
note:: We do not currently make any guarantees about `ABI `__ stability, since the primary use-case is for tskit to be embedded within another application rather than used as a shared library. If you do intend to use tskit as a shared library and ABI stability is therefore important to you, please let us know and we can plan accordingly. .. _sec_c_api_overview_structure: ------------- API structure ------------- Tskit uses a set of conventions to provide a pseudo object-oriented API. Each 'object' is represented by a C struct and has a set of 'methods'. This is most easily explained by an example: .. literalinclude:: ../c/examples/api_structure.c :language: c In this program we create a :c:type:`tsk_edge_table_t` instance, add five rows using :c:func:`tsk_edge_table_add_row`, print out its contents using the :c:func:`tsk_edge_table_print_state` debugging method, and finally free the memory used by the edge table object. We define this edge table 'class' by using some simple naming conventions which are adhered to throughout ``tskit``. This is simply a naming convention that helps to keep code written in plain C logically structured; there are no extra C++ style features. We use object oriented terminology freely throughout this documentation with this understanding. In this convention, a class is defined by a struct ``tsk_class_name_t`` (e.g. ``tsk_edge_table_t``) and its methods all have the form ``tsk_class_name_method_name`` whose first argument is always a pointer to an instance of the class (e.g., ``tsk_edge_table_add_row`` above). Each class has an initialise and free method, called ``tsk_class_name_init`` and ``tsk_class_name_free``, respectively. The init method must be called to ensure that the object is correctly initialised (except for functions such as :c:func:`tsk_table_collection_load` and :c:func:`tsk_table_collection_copy` which automatically initialise the object by default for convenience).
The free method must always be called to avoid leaking memory, even in the case of an error occurring during initialisation. If ``tsk_class_name_init`` has been called successfully, we say the object has been "initialised"; if not, it is "uninitialised". After ``tsk_class_name_free`` has been called, the object is again uninitialised. It is important to note that the init methods only allocate *internal* memory; the memory for the instance itself must be allocated either on the heap or the stack: .. code-block:: c // Instance allocated on the stack tsk_node_table_t nodes; tsk_node_table_init(&nodes, 0); tsk_node_table_free(&nodes); // Instance allocated on the heap tsk_edge_table_t *edges = malloc(sizeof(tsk_edge_table_t)); tsk_edge_table_init(edges, 0); tsk_edge_table_free(edges); free(edges); .. _sec_c_api_error_handling: -------------- Error handling -------------- C does not have a mechanism for propagating exceptions, and great care must be taken to ensure that errors are correctly and safely handled. The convention adopted in ``tskit`` is that every function (except for trivial accessor methods) returns an integer. If this return value is negative an error has occurred which must be handled. A description of the error that occurred can be obtained using the :c:func:`tsk_strerror` function. The following example illustrates the key conventions around error handling in ``tskit``: .. literalinclude:: ../c/examples/error_handling.c :language: c In this example we load a tree sequence from file and print out a summary of the number of nodes and edges it contains. After calling :c:func:`tsk_treeseq_load` we check the return value ``ret`` to see if an error occurred. If an error has occurred we exit with an error message produced by :c:func:`tsk_strerror`.
Note that in this example we call :c:func:`tsk_treeseq_free` whether or not an error occurs: in general, once a function that initialises an object (e.g., ``X_init``, ``X_copy`` or ``X_load``) is called, then ``X_free`` must be called to ensure that memory is not leaked. Most functions in ``tskit`` return an error status; we recommend that **every** return value is checked. .. _sec_c_api_memory_allocation_strategy: -------------------------- Memory allocation strategy -------------------------- To reduce the frequency of memory allocations tskit pre-allocates space for additional table rows in each table, along with space for the contents of ragged columns. The default behaviour is to start with space for 1,024 rows in each table and 65,536 bytes in each ragged column. The table then grows as needed by doubling, until a maximum pre-allocation of 2,097,152 rows for a table or 104,857,600 bytes for a ragged column. This behaviour can be disabled and a fixed increment used, on a per-table and per-ragged-column basis using the ``tsk_X_table_set_max_rows_increment`` and ``tsk_provenance_table_set_max_X_length_increment`` methods where ``X`` is the name of the table or column. --------------------------- Using tskit in your project --------------------------- Tskit is built as a standard C library and so there are many different ways in which it can be included in downstream projects. It is possible to install ``tskit`` onto a system (i.e., installing a shared library and header files to standard locations on Unix) and link against it, but there are many different ways in which this can go wrong. In the interest of simplicity and improving the end-user experience we recommend embedding ``tskit`` directly into your applications. There are many different build systems and approaches to compiling code, and so it's not possible to give definitive documentation on how ``tskit`` should be included in downstream projects.
Please see the `build examples `_ repo for some examples of how to incorporate ``tskit`` into different project structures and build systems. Tskit uses the `meson `_ build system internally, and supports being used as a `meson subproject `_. We show an `example `_ in which this is combined with the tskit distribution tarball to neatly abstract many details of cross-platform C development. Some users may choose to check the source for ``tskit`` directly into their source control repositories. If you wish to do this, the code is in the ``c`` subdirectory of the `tskit <https://github.com/tskit-dev/tskit>`_ repo. The following header files should be placed in the search path: ``subprojects/kastore/kastore.h``, ``tskit.h``, and ``tskit/*.h``. The C files ``subprojects/kastore/kastore.c`` and ``tskit/*.c`` should be compiled. For those who wish to minimise the size of their compiled binaries, ``tskit`` is quite modular, and C files can be omitted if not needed. For example, if you are just using the :ref:`sec_c_api_tables_api` then only the files ``tskit/core.[c,h]`` and ``tskit/tables.[c,h]`` are needed. However you include ``tskit`` in your project, please ensure that it is a **released version**. Released versions are tagged on GitHub using the convention ``C_{VERSION}``. The code can either be downloaded from GitHub on the `releases page <https://github.com/tskit-dev/tskit/releases>`_ where each release has a distribution tarball, for example https://github.com/tskit-dev/tskit/releases/download/C_1.0.0/tskit-1.0.0.tar.xz Alternatively the code can be checked out using git. For example, to check out the ``C_1.0.0`` release:: $ git clone https://github.com/tskit-dev/tskit.git $ cd tskit $ git checkout C_1.0.0 *********** Basic Types *********** .. doxygentypedef:: tsk_id_t .. doxygentypedef:: tsk_size_t .. doxygentypedef:: tsk_flags_t .. doxygentypedef:: tsk_bool_t ************** Common options ************** ..
doxygengroup:: GENERIC_FUNCTION_OPTIONS :content-only: ********** Tables API ********** The tables API section of ``tskit`` is defined in the ``tskit/tables.h`` header. ----------------- Table collections ----------------- .. doxygenstruct:: tsk_table_collection_t :members: .. doxygenstruct:: tsk_bookmark_t :members: .. doxygengroup:: TABLE_COLLECTION_API_GROUP :content-only: ----------- Individuals ----------- .. doxygenstruct:: tsk_individual_t :members: .. doxygenstruct:: tsk_individual_table_t :members: .. doxygengroup:: INDIVIDUAL_TABLE_API_GROUP :content-only: ----- Nodes ----- .. doxygenstruct:: tsk_node_t :members: .. doxygenstruct:: tsk_node_table_t :members: .. doxygengroup:: NODE_TABLE_API_GROUP :content-only: ----- Edges ----- .. doxygenstruct:: tsk_edge_t :members: .. doxygenstruct:: tsk_edge_table_t :members: .. doxygengroup:: EDGE_TABLE_API_GROUP :content-only: ---------- Migrations ---------- .. doxygenstruct:: tsk_migration_t :members: .. doxygenstruct:: tsk_migration_table_t :members: .. doxygengroup:: MIGRATION_TABLE_API_GROUP :content-only: ----- Sites ----- .. doxygenstruct:: tsk_site_t :members: .. doxygenstruct:: tsk_site_table_t :members: .. doxygengroup:: SITE_TABLE_API_GROUP :content-only: --------- Mutations --------- .. doxygenstruct:: tsk_mutation_t :members: .. doxygenstruct:: tsk_mutation_table_t :members: .. doxygengroup:: MUTATION_TABLE_API_GROUP :content-only: ----------- Populations ----------- .. doxygenstruct:: tsk_population_t :members: .. doxygenstruct:: tsk_population_table_t :members: .. doxygengroup:: POPULATION_TABLE_API_GROUP :content-only: ----------- Provenances ----------- .. doxygenstruct:: tsk_provenance_t :members: .. doxygenstruct:: tsk_provenance_table_t :members: .. doxygengroup:: PROVENANCE_TABLE_API_GROUP :content-only: .. 
_sec_c_api_table_indexes: ------------- Table indexes ------------- Along with the tree sequence :ref:`ordering requirements `, the :ref:`sec_table_indexes` allow us to take a table collection and efficiently operate on the trees defined within it. This section defines the rules for safely operating on table indexes and their life-cycle. The edge index used for tree generation consists of two arrays, each holding ``N`` edge IDs (where ``N`` is the size of the edge table). When the index is computed using :c:func:`tsk_table_collection_build_index`, we store the current size of the edge table along with the two arrays of edge IDs. The function :c:func:`tsk_table_collection_has_index` then returns true iff (a) both of these arrays are not NULL and (b) the stored number of edges is the same as the current size of the edge table. Updating the edge table does not automatically invalidate the indexes. Thus, if we call :c:func:`tsk_edge_table_clear` on an edge table which has an index, this index will still exist. However, it will not be considered a valid index by :c:func:`tsk_table_collection_has_index` because of the size mismatch. Similarly for functions that increase the size of the table. Note that it is possible then to have :c:func:`tsk_table_collection_has_index` return true, but the index is not actually valid, if, for example, the user has manipulated the node and edge tables to describe a different topology, which happens to have the same number of edges. The behaviour of methods that use the indexes will be undefined in this case. Thus, if you are manipulating an existing table collection that may be indexed, it is always recommended to call :c:func:`tsk_table_collection_drop_index` first. .. _sec_c_api_tree_sequences: ************** Tree sequences ************** .. doxygenstruct:: tsk_treeseq_t :members: .. doxygengroup:: TREESEQ_API_GROUP :content-only: .. _sec_c_api_trees: ***** Trees ***** .. 
doxygenstruct:: tsk_tree_t :members: --------- Lifecycle --------- .. doxygengroup:: TREE_API_LIFECYCLE_GROUP :content-only: .. _sec_c_api_trees_null: ---------- Null state ---------- Trees are initially in a "null state" where each sample is a root and there are no branches. The ``index`` of a tree in the null state is ``-1``. We must call one of the :ref:`seeking<sec_c_api_trees_seeking>` methods to make the state of the tree object correspond to a particular tree in the sequence. .. _sec_c_api_trees_seeking: ------- Seeking ------- When we are examining many trees along a tree sequence, we usually allocate a single :c:struct:`tsk_tree_t` object and update its state. This allows us to efficiently transform the state of a tree into nearby trees, using the underlying succinct tree sequence data structure. The simplest example is to visit trees left-to-right along the genome: .. code-block:: c :linenos: int visit_trees(const tsk_treeseq_t *ts) { tsk_tree_t tree; int ret; ret = tsk_tree_init(&tree, ts, 0); if (ret != 0) { goto out; } for (ret = tsk_tree_first(&tree); ret == TSK_TREE_OK; ret = tsk_tree_next(&tree)) { printf("\ttree %lld covers interval left=%f right=%f\n", (long long) tree.index, tree.interval.left, tree.interval.right); } if (ret != 0) { goto out; } // Do other things in the function... out: tsk_tree_free(&tree); return ret; } In this example we first initialise a :c:struct:`tsk_tree_t` object, associating it with the input tree sequence. We then iterate over the trees along the sequence using a ``for`` loop, with the ``ret`` variable controlling iteration. The usage of ``ret`` here follows a slightly different pattern to other functions in the tskit C API (see the :ref:`sec_c_api_error_handling` section). The interaction between error handling and states of the ``tree`` object here is somewhat subtle, and is worth explaining in detail. After successful initialisation (after line 10), the tree is in the :ref:`null state<sec_c_api_trees_null>` where all samples are roots.
The ``for`` loop begins by calling :c:func:`tsk_tree_first` which transforms the state of the tree into the first (leftmost) tree in the sequence. If this operation is successful, :c:func:`tsk_tree_first` returns :c:data:`TSK_TREE_OK`. We then check the value of ``ret`` in the loop condition to see if it is equal to :c:data:`TSK_TREE_OK` and execute the loop body for the first tree in the sequence. On completing the loop body for the first tree in the sequence, we then execute the ``for`` loop increment operation, which calls :c:func:`tsk_tree_next` and assigns the returned value to ``ret``. This function efficiently transforms the current state of ``tree`` so that it represents the next tree along the genome, and returns :c:data:`TSK_TREE_OK` if the operation succeeds. When :c:func:`tsk_tree_next` is called on the last tree in the sequence, the state of ``tree`` is set back to the :ref:`null state<sec_c_api_trees_null>` and the return value is 0. Thus, the loop on lines 11-14 can exit in two ways: 1. Either we successfully iterate over all trees in the sequence and ``ret`` has the value ``0`` at line 15; or 2. An error occurs during :c:func:`tsk_tree_first` or :c:func:`tsk_tree_next`, and ``ret`` contains a negative value. .. warning:: It is **vital** that you check the value of ``ret`` immediately after the loop exits like we do here at line 15, or errors can be silently lost. (Although it's redundant here, as we don't do anything else in the function.) .. seealso:: See the :ref:`examples<sec_c_api_examples_tree_iteration>` section for more examples of sequential seeking, including an example of using :c:func:`tsk_tree_last` and :c:func:`tsk_tree_prev` to iterate from right-to-left. .. note:: Seeking functions :c:func:`tsk_tree_first`, :c:func:`tsk_tree_last`, :c:func:`tsk_tree_next`, :c:func:`tsk_tree_prev`, and :c:func:`tsk_tree_seek` can be called in any order and from any non-error state. .. doxygengroup:: TREE_API_SEEKING_GROUP :content-only: ------------ Tree queries ------------ ..
doxygengroup:: TREE_API_TREE_QUERY_GROUP :content-only: ------------ Node queries ------------ .. doxygengroup:: TREE_API_NODE_QUERY_GROUP :content-only: ---------------- Traversal orders ---------------- .. doxygengroup:: TREE_API_TRAVERSAL_GROUP :content-only: .. _sec_c_api_low_level_sorting: ***************** Low-level sorting ***************** In some highly performance sensitive cases it can be useful to have more control over the process of sorting tables. This low-level API allows a user to provide their own edge sorting function. This can be useful, for example, to use parallel sorting algorithms, or to take advantage of the more efficient sorting procedures available in C++. It is the user's responsibility to ensure that the edge sorting requirements are fulfilled by this function. .. todo:: Create an idiomatic C++11 example where we load a table collection file from argv[1], and sort the edges using std::sort, based on the example in tests/test_minimal_cpp.cpp. We can include this in the examples below, and link to it here. .. doxygenstruct:: _tsk_table_sorter_t :members: .. doxygengroup:: TABLE_SORTER_API_GROUP :content-only: ****************** Decoding genotypes ****************** Obtaining genotypes for samples at specific sites is achieved via :c:struct:`tsk_variant_t` and its methods. .. doxygenstruct:: tsk_variant_t :members: .. doxygengroup:: VARIANT_API_GROUP :content-only: *********************** Miscellaneous functions *********************** .. doxygenfunction:: tsk_strerror .. doxygenfunction:: tsk_is_unknown_time ************************* Function Specific Options ************************* ------------- Load and init ------------- .. doxygengroup:: API_FLAGS_LOAD_INIT_GROUP :content-only: -------------------------- :c:func:`tsk_treeseq_init` -------------------------- .. 
doxygengroup:: API_FLAGS_TS_INIT_GROUP :content-only: ----------------------------------------------------------------------- :c:func:`tsk_treeseq_simplify`, :c:func:`tsk_table_collection_simplify` ----------------------------------------------------------------------- .. doxygengroup:: API_FLAGS_SIMPLIFY_GROUP :content-only: ---------------------------------------------- :c:func:`tsk_table_collection_check_integrity` ---------------------------------------------- .. doxygengroup:: API_FLAGS_CHECK_INTEGRITY_GROUP :content-only: ------------------------------------ :c:func:`tsk_table_collection_clear` ------------------------------------ .. doxygengroup:: API_FLAGS_CLEAR_GROUP :content-only: ----------------------------------- :c:func:`tsk_table_collection_copy` ----------------------------------- .. doxygengroup:: API_FLAGS_COPY_GROUP :content-only: ---------------------- All equality functions ---------------------- .. doxygengroup:: API_FLAGS_CMP_GROUP :content-only: ------------------------------------- :c:func:`tsk_table_collection_subset` ------------------------------------- .. doxygengroup:: API_FLAGS_SUBSET_GROUP :content-only: ------------------------------------ :c:func:`tsk_table_collection_union` ------------------------------------ .. doxygengroup:: API_FLAGS_UNION_GROUP :content-only: ********* Constants ********* ----------- API Version ----------- .. doxygengroup:: API_VERSION_GROUP :content-only: .. _sec_c_api_error_codes: ---------------- Common constants ---------------- .. doxygengroup:: GENERIC_CONSTANTS :content-only: .. _sec_c_api_tables_api: -------------- Generic Errors -------------- .. doxygengroup:: GENERAL_ERROR_GROUP :content-only: ------------------ File format errors ------------------ .. doxygengroup:: FILE_FORMAT_ERROR_GROUP :content-only: -------------------- Out-of-bounds errors -------------------- .. doxygengroup:: OOB_ERROR_GROUP :content-only: ----------- Edge errors ----------- .. 
doxygengroup:: EDGE_ERROR_GROUP :content-only: ----------- Site errors ----------- .. doxygengroup:: SITE_ERROR_GROUP :content-only: --------------- Mutation errors --------------- .. doxygengroup:: MUTATION_ERROR_GROUP :content-only: ---------------- Migration errors ---------------- .. doxygengroup:: MIGRATION_ERROR_GROUP :content-only: ------------- Sample errors ------------- .. doxygengroup:: SAMPLE_ERROR_GROUP :content-only: ------------ Table errors ------------ .. doxygengroup:: TABLE_ERROR_GROUP :content-only: ------------------------ Genotype decoding errors ------------------------ .. doxygengroup:: GENOTYPE_ERROR_GROUP :content-only: ------------ Union errors ------------ .. doxygengroup:: UNION_ERROR_GROUP :content-only: --------------- Simplify errors --------------- .. doxygengroup:: SIMPLIFY_ERROR_GROUP :content-only: ----------------- Individual errors ----------------- .. doxygengroup:: INDIVIDUAL_ERROR_GROUP :content-only: ------------------- Extend edges errors ------------------- .. doxygengroup:: EXTEND_EDGES_ERROR_GROUP :content-only: .. _sec_c_api_examples: ******** Examples ******** ------------------------ Basic forwards simulator ------------------------ This is an example of using the tables API to define a simple haploid Wright-Fisher simulator. Because this simple example repeatedly sorts the edge data, it is quite inefficient and should not be used as the basis of a large-scale simulator. .. note:: This example uses the C function ``rand`` and constant ``RAND_MAX`` for random number generation. These methods are used for example purposes only and a high-quality random number library should be preferred for code used for research. Examples include, but are not limited to: 1. The `GNU Scientific Library `_, which is licensed under the GNU General Public License, version 3 (`GPL3+ `_. 2. For C++ projects using C++11 or later, the built-in `random `_ number library. 3. 
The `numpy C API `_ may be useful for those writing Python extension modules in C/C++. .. todo:: Give a pointer to an example that caches and flushes edge data efficiently. Probably using the C++ API? .. literalinclude:: ../c/examples/haploid_wright_fisher.c :language: c .. _sec_c_api_examples_tree_iteration: -------------- Tree iteration -------------- .. literalinclude:: ../c/examples/tree_iteration.c :language: c .. _sec_c_api_examples_tree_traversals: --------------- Tree traversals --------------- In this example we load a tree sequence file, and then traverse the first tree in four different ways: 1. We first traverse the tree in preorder and postorder using the :c:func:`tsk_tree_preorder` :c:func:`tsk_tree_postorder` functions to fill an array of nodes in the appropriate orders. This is the recommended approach and will be convenient and efficient for most purposes. 2. As an example of how we might build our own traveral algorithms, we then traverse the tree in preorder using recursion. This is a very common way of navigating around trees and can be convenient for some applications. For example, here we compute the depth of each node (i.e., it's distance from the root) and use this when printing out the nodes as we visit them. 3. Then we traverse the tree in preorder using an iterative approach. This is a little more efficient than using recursion, and is sometimes more convenient than structuring the calculation recursively. 4. In the third example we iterate upwards from the samples rather than downwards from the root. .. literalinclude:: ../c/examples/tree_traversal.c :language: c .. _sec_c_api_examples_file_streaming: -------------- File streaming -------------- It is often useful to read tree sequence files from a stream rather than from a fixed filename. This example shows how to do this using the :c:func:`tsk_table_collection_loadf` and :c:func:`tsk_table_collection_dumpf` functions. 
Here, we sequentially load table collections from the ``stdin`` stream and write them back out to ``stdout`` with their mutations removed. .. literalinclude:: ../c/examples/streaming.c :language: c Note that we use the value :c:macro:`TSK_ERR_EOF` to detect when the stream ends, as we don't know how many tree sequences to expect on the input. In this case, :c:macro:`TSK_ERR_EOF` is not considered an error and we exit normally. Running this program on some tree sequence files we might get:: $ cat tmp1.trees tmp2.trees | ./build/streaming > no_mutations.trees Tree sequence 0 had 38 mutations Tree sequence 1 had 132 mutations Then, running this program again on the output of the previous command, we see that we now have two tree sequences with their mutations removed stored in the file ``no_mutations.trees``:: $ ./build/streaming < no_mutations.trees > /dev/null Tree sequence 0 had 0 mutations Tree sequence 1 had 0 mutations ------------------------------------ Parallel, multichromosome simulation ------------------------------------ A substantial bottleneck in forwards simulations using tree sequences is *simplification*. This is therefore a natural target for parallelization. The potential for breaking up a chromosome into discrete chunks that are separately parallelized is limited, however, since any edge that extends across the boundary between two chunks is split; thus creating more work. However, distinct chromosomes provide a natural target: the edge tables describing inheritance for each chromosome can be independently simplified, as long as the fact that they all refer to the same set of nodes. This simulation keeps each chromosome in a separate tree sequence, but they essentially share a common node table; the :c:macro:`TSK_SIMPLIFY_NO_FILTER_NODES` flag is used so that each call to :c:func:`tsk_table_collection_simplify` does not change the common node table. 
Afterwards, we iterate through the edge tables to determine which nodes need to be retained, and use :c:func:`tsk_node_table_keep_rows` to remove unused nodes.
However, that would double the memory footprint, and since this codec is intended for large metadata, we did not use that approach in this example. Along the same lines, it is worth noting that this example does make a copy of the JSON and binary data when writing, in ``json_struct_codec_create_buffer()``, which doubles the memory footprint at that point, and adds the overhead of copying the data. A more efficient approach would be to calculate the buffer length needed for the codec’s data, allocate the buffer with that length, and then generate the necessary JSON and binary metadata directly into that buffer. This would require the metadata-generating code to be more closely entwined with the code for handling the json+struct codec header and padding bytes, and so we have chosen not to adopt that approach here, for pedagogical purposes; but if your use of this codec will involve large metadata, such an approach is recommended. ================================================ FILE: docs/changelogs.rst ================================================ .. note: this is left in rst format to avoid Duplicate ID issues .. _sec_changelogs: ========== Changelogs ========== ****** Python ****** .. include:: ../python/CHANGELOG.rst ***** C API ***** .. 
include:: ../c/CHANGELOG.rst ================================================ FILE: docs/citation.md ================================================ (sec_citation)= # Citing tskit If you use `tskit` in your work, we recommend citing the [2024 ARG Genetics paper]() and the [2016 msprime PLOS Computational Biology paper](): > Yan Wong, Anastasia Ignatieva, Jere Koskela, Gregor Gorjanc, Anthony W > Wohns, Jerome Kelleher, *A general and efficient representation of ancestral > recombination graphs*, Genetics, Volume 228, Issue 1, September 2024, iyae100, > https://doi.org/10.1093/genetics/iyae100 > Jerome Kelleher, Alison M Etheridge and Gilean McVean (2016), > *Efficient Coalescent Simulation and Genealogical Analysis for Large Sample Sizes*, > PLOS Comput Biol 12(5): e1004842. doi: 10.1371/journal.pcbi.1004842 If you use summary statistics, please cite the [2020 Genetics paper](https://doi.org/10.1534/genetics.120.303253): > Peter Ralph, Kevin Thornton, Jerome Kelleher, *Efficiently Summarizing > Relationships in Large Samples: A General Duality Between Statistics of > Genealogies and Genomes*, Genetics, Volume 215, Issue 3, 1 July 2020, > Pages 779–797, https://doi.org/10.1534/genetics.120.303253 Bibtex records: ```bibtex @article{Wong2024ARGs, author = {Wong, Yan and Ignatieva, Anastasia and Koskela, Jere and Gorjanc, Gregor and Wohns, Anthony W and Kelleher, Jerome}, title = {A general and efficient representation of ancestral recombination graphs}, journal = {Genetics}, volume = {228}, number = {1}, pages = {iyae100}, year = {2024}, doi = {10.1093/genetics/iyae100} } @article{Kelleher2016msprime, author = {Kelleher, Jerome and Etheridge, Alison M and McVean, Gilean}, title = {Efficient coalescent simulation and genealogical analysis for large sample sizes}, journal = {PLoS Computational Biology}, volume = {12}, number = {5}, pages = {e1004842}, year = {2016}, publisher = {Public Library of Science} } @article{Ralph2020Stats, author = {Ralph, Peter and Thornton, 
Kevin and Kelleher, Jerome}, title = {Efficiently Summarizing Relationships in Large Samples: A General Duality Between Statistics of Genealogies and Genomes}, journal = {Genetics}, volume = {215}, number = {3}, pages = {779--797}, year = {2020}, doi = {10.1534/genetics.120.303253} } ``` ================================================ FILE: docs/cli.md ================================================ --- jupytext: text_representation: extension: .md format_name: myst format_version: 0.12 jupytext_version: 1.9.1 kernelspec: display_name: Python 3 language: python name: python3 --- ```{currentmodule} tskit ``` (sec_cli)= # Command line interface ```{eval-rst} .. argparse:: :module: tskit.cli :func: get_tskit_parser :prog: python3 -m tskit ``` ================================================ FILE: docs/data-model.md ================================================ --- jupytext: text_representation: extension: .md format_name: myst format_version: 0.12 jupytext_version: 1.9.1 kernelspec: display_name: Python 3 language: python name: python3 --- :::{currentmodule} tskit ::: (sec_data_model)= # Data model The `tskit` library deals with sets of sampled genome sequences through storage and analysis of their shared genetic ancestry. This genealogical ancestry (sometimes known as an Ancestral Recombination Graph) is stored concisely in `tskit` in the "succinct tree sequence" format, which comprises a collection of easy-to-understand tables. This page documents the structure of the tables and encoding of table data, as well as the encoding of the correlated genetic trees that can be extracted from a `tskit` tree sequence. We begin by defining the the structure of the tables in the {ref}`sec_table_definitions` section. The {ref}`sec_data_model_data_encoding` section then describe how data is stored in those tables (also see the {ref}`sec_file_formats` chapter). 
The {ref}`sec_data_model_tree_structure` section then describes the encoding of the trees that are generated from the {class}`NodeTable` and {class}`EdgeTable`. Finally, we describe how genotype data arises from tree structure, especially how we can incorporate the idea of missing data. (sec_table_definitions)= ## Table definitions (sec_table_types_definitions)= ### Table types A tree sequence can be stored in a collection of eight tables: {ref}`Node `, {ref}`Edge `, {ref}`Individual `, {ref}`Site `, {ref}`Mutation `, {ref}`Migration `, {ref}`Population `, and {ref}`Provenance `. The Node and Edge tables store the genealogical relationships that define the trees, and the Individual table describes how multiple genomes are grouped within individuals; the Site and Mutation tables describe where mutations fall on the trees; the Migration table describes how lineages move across space; and the Provenance table contains information on where the data came from. Only Node and Edge tables are necessary to encode the genealogical trees; Sites and Mutations are optional but necessary to encode polymorphism (sequence) data; the remainder are optional. In the following sections we define these components of a tree sequence in more detail. (sec_node_table_definition)= #### Node Table A **node** defines a monoploid set of chromosomes (a "genome") of a specific individual that was born at some time in the past: the set of chromosomes inherited from a particular one of the individual's parents. (See {ref}`sec_nodes_or_individuals` for more discussion.) Every vertex in the marginal trees of a tree sequence corresponds to exactly one node, and a node may be present in many trees. The node table contains five columns, of which `flags` and `time` are mandatory: | Column | Type | Description | | :------------ | ----------- | -------------------------------------: | | flags | uint32 | Bitwise flags. | | time | double | Birth time of node. 
| | population | int32 | Birth population of node. | | individual | int32 | The individual the node belongs to. | | metadata | binary | Node {ref}`sec_metadata_definition`. | The `time` column records the birth time of the individual in question, and is a floating point value. Similarly, the `population` column records the ID of the population where this individual was born. If not provided, `population` defaults to the null ID (-1). Otherwise, the population ID must refer to a row in the {ref}`sec_population_table_definition`. The `individual` column records the ID of the {ref}`Individual ` individual that this node belongs to. If specified, the ID must refer to a valid individual. If not provided, `individual` defaults to the null ID (-1). The `flags` column stores information about a particular node, and is composed of 32 bitwise boolean values. Currently, the only flag defined is `NODE_IS_SAMPLE = 1`, which defines the *sample* status of nodes. Marking a particular node as a "sample" means, for example, that the mutational state of the node will be included in the genotypes produced by {meth}`TreeSequence.variants`. Bits 0-15 (inclusive) of the `flags` column are reserved for internal use by `tskit` and should not be used by applications for anything other than the purposes documented here. Bits 16-31 (inclusive) are free for applications to use for any purpose and will not be altered or interpreteted by `tskit`. See the {ref}`sec_node_requirements` section for details on the properties required for a valid set of nodes. For convenience, the {ref}`text format ` for nodes decomposes the `flags` value into its separate values. Thus, in the text format we have a column for `is_sample`, which corresponds to the `flags` column in the underlying table. As more flags values are defined, these will be added to the text file format. The `metadata` column provides a location for client code to store information about each node. 
See the {ref}`sec_metadata_definition` section for more details on how metadata columns should be used. :::{note} The distinction between `flags` and `metadata` is that flags holds information about a node that the library understands, whereas metadata holds information about a node that the library *does not* understand. Metadata is for storing auxiliarly information that is not necessary for the core tree sequence algorithms. ::: (sec_individual_table_definition)= #### Individual Table An **individual** defines how nodes (which can be seen as representing single chromosomes) group together in a polyploid individual. The individual table contains three columns, of which only `flags` is mandatory. | Column | Type | Description | | :------------ | ---------- | -----------------------------------------: | | flags | uint32 | Bitwise flags. | | location | double | Location in arbitrary dimensions. | | parents | int32 | Ids of parent individuals. | | metadata | binary | Individual {ref}`sec_metadata_definition`. | See the {ref}`sec_individual_requirements` section for details on the properties required for a valid set of individuals. The `flags` column stores information about a particular individual, and is composed of 32 bitwise boolean values. Currently, no flags are defined. Bits 0-15 (inclusive) of the `flags` column are reserved for internal use by `tskit` and should not be used by applications for anything other than the purposes documented here. Bits 16-31 (inclusive) are free for applications to use for any purpose and will not be altered or interpreteted by `tskit`. The `location` column stores the location of an individual in arbitrary dimensions. This column is {ref}`ragged `, and so different individuals can have locations with different dimensions (i.e., one individual may have location `[]` and another `[0, 1, 0]`. This could therefore be used to store other quantities (e.g., phenotype). 
The `parents` column stores the ids of other individuals that are the parents of an individual. This can be used to store pedigree information for individuals. This column is {ref}`ragged ` such that an individual can have any number of parents. The `metadata` column provides a location for client code to store information about each individual. See the {ref}`sec_metadata_definition` section for more details on how metadata columns should be used. :::{note} The distinction between `flags` and `metadata` is that flags holds information about a individual that the library understands, whereas metadata holds information about a individual that the library *does not* understand. Metadata is for storing auxiliarly information that is not necessary for the core tree sequence algorithms. ::: (sec_edge_table_definition)= #### Edge Table An **edge** defines a parent-child relationship between a pair of nodes over a specific sequence interval. The edge table contains five columns, all of which are mandatory except `metadata`: | Column | Type | Description | | :------------ | ---------- | -----------------------------------------: | | left | double | Left coordinate of the edge (inclusive). | | right | double | Right coordinate of the edge (exclusive). | | parent | int32 | Parent node ID. | | child | int32 | Child node ID. | | metadata | binary | Node {ref}`sec_metadata_definition`. | Each row in an edge table describes a half-open genomic interval `[left, right)` over which the `child` inherited from the given `parent`. The `left` and `right` columns are defined using double precision floating point values. The `parent` and `child` columns specify integer IDs in the associated {ref}`sec_node_table_definition`. The `metadata` column provides a location for client code to store information about each edge. See the {ref}`sec_metadata_definition` section for more details on how metadata columns should be used. 
See the {ref}`sec_edge_requirements` section for details on the properties required for a valid set of edges. (sec_site_table_definition)= #### Site Table A **site** defines a particular location along the genome in which we are interested in observing the allelic state. The site table contains three columns, of which `position` and `ancestral_state` are mandatory. | Column | Type | Description | | :-------------- | ---------- | -----------------------------------------: | | position | double | Position of site in genome coordinates. | | ancestral_state | text | The state at the root of the tree. | | metadata | binary | Site {ref}`sec_metadata_definition`. | The `position` column is a floating point value defining the location of the site in question along the genome. The `ancestral_state` column specifies the allelic state at the root of the tree, thus defining the state that nodes inherit if no mutations intervene. The column stores text character data of arbitrary length. The `metadata` column provides a location for client code to store information about each site. See the {ref}`sec_metadata_definition` section for more details on how metadata columns should be used. See the {ref}`sec_site_requirements` section for details on the properties required for a valid set of sites. (sec_mutation_table_definition)= #### Mutation Table A **mutation** defines a change of allelic state on a tree at a particular site. The mutation table contains five columns, of which `site`, `node` and `derived_state` are mandatory. | Column | Type | Description | | :-------------- | ---------- | ---------------------------------------------: | | site | int32 | The ID of the site the mutation occurs at. | | node | int32 | The node this mutation occurs at. | | parent | int32 | The ID of the parent mutation. | | time | double | Time at which the mutation occurred. | | derived_state | char | The allelic state resulting from the mutation. 
| | metadata | binary | Mutation {ref}`sec_metadata_definition`. | The `site` column is an integer value defining the ID of the {ref}`site ` at which this mutation occurred. The `node` column is an integer value defining the ID of the first {ref}`node ` in the tree below this mutation. The `time` column is a double precision floating point value recording how long ago the mutation happened. The `derived_state` column specifies the allelic state resulting from the mutation, thus defining the state that the `node` and any descendant nodes in the subtree inherit unless further mutations occur. The column stores text character data of arbitrary length. The `parent` column is an integer value defining the ID of the mutation whose allelic state this mutation replaced. If there is no mutation at the site in question on the path back to root, then this field is set to the null ID (-1). (The `parent` column is only required in situations where there are multiple mutations at a given site. For "infinite sites" mutations, it can be ignored.) The `metadata` column provides a location for client code to store information about each site. See the {ref}`sec_metadata_definition` section for more details on how metadata columns should be used. See the {ref}`sec_mutation_requirements` section for details on the properties required for a valid set of mutations. (sec_migration_table_definition)= #### Migration Table :::{note} Encoding migration in the migrations table is a legacy approach associated with older versions of `msprime`; recording movement between populations in the migration table is entirely optional, even when related nodes are assigned to different populations. ::: :::{warning} The migration table may be entirely removed from the `tskit` data model in the future. Meanwhile, a number of `tskit` functions, such as {meth}`~TreeSequence.simplify()` will raise an error if data exists in the migrations table. 
::: :::{seealso} The {ref}`msprime:sec_ancestry_record_migrations` sections and the associated discussion of {ref}`msprime:sec_demography_migration` in the `msprime` documentation. ::: In simulations, trees can be thought of as spread across space, and it is helpful for inferring demographic history to record this history. Migrations are performed by individual ancestors, but might not be tagged by an individual whose genome is tracked as a `node` (as in a discrete-deme model they are unlikely to be both a migrant and a most recent common ancestor). So, `tskit` can record separately when a segment of ancestry has moved between populations. This table is not required, even if different nodes come from different populations. | Column | Type | Description | | :--------- | -------- | -----------------------------------------------------: | | left | double | Left coordinate of the migrating segment (inclusive). | | right | double | Right coordinate of the migrating segment (exclusive). | | node | int32 | Node ID. | | source | int32 | Source population ID. | | dest | int32 | Destination population ID. | | time | double | Time of migration event. | | metadata | binary | Migration {ref}`sec_metadata_definition`. | The `left` and `right` columns are floating point values defining the half-open segment of genome affected (these need not exactly correspond to breakpoints between edges). The `source` and `dest` columns record the IDs of the respective populations (note that by `msprime` convention, "source" and "destination" are defined in reverse time, see {ref}`msprime:sec_demography_direction_of_time`.). The `time` column holds floating point values recording the time of the event, with migrations assumed to occur instantaneously. 
The `node` column records the ID of the child node of the migrating segment; in consequence the population ID of the `node` will match the `source` ID (unless sequential migrations affect the same `node`, in which case it will match the `source` value of the youngest of those migrations).
By default, {meth}`TreeSequence.dump_text` will base64-encode metadata values that are stored as raw bytes (when `base64_metadata=True`) so that binary data can be safely printed and exchanged; in this case {func}`tskit.load_text` will base64-decode the corresponding text fields back to bytes. When metadata has already been decoded to a structured Python object (for example via a metadata schema), the textual representation written by {meth}`TreeSequence.dump_text` is the `repr` of that object, and {func}`tskit.load_text` does not attempt to reconstruct the original structured value from this representation.
The Python API also provides tools that can transform a collection of tables into a valid collection of tables, so long as they are logically consistent, see {ref}`sec_tables_api_creating_valid_tree_sequence`. (sec_individual_requirements)= #### Individual requirements Individuals are a basic type in a tree sequence and are not defined with respect to any other tables. Individuals can have a reference to their parent individuals, if present these references must be valid or null (-1). An individual cannot list itself as its own parent. A valid tree sequence does not require individuals to be sorted in any particular order, and sorting a set of tables using {meth}`TableCollection.sort` has no effect on individuals. However, individuals can be optionally sorted using {meth}`TableCollection.sort_individuals`. (sec_node_requirements)= #### Node requirements Given a valid set of individuals and populations, the requirements for each node are: - `time` must be a finite (non-NaN, non-infinite) value; - `population` must either be null (-1) or refer to a valid population ID; - `individual` must either be null (-1) or refer to a valid individual ID. An ID refers to a zero-indexed row number in the relevant table, and so is "valid" if is between 0 and one less than the number of rows in the relevant table. There are no requirements regarding the ordering of nodes with respect to time. Sorting a set of tables using {meth}`TableCollection.sort` has no effect on nodes. (sec_edge_requirements)= #### Edge requirements Given a valid set of nodes and a sequence length {math}`L`, the simple requirements for each edge are: - We must have finite coordinates with {math}`0 \leq` `left` {math}`<` `right` {math}`\leq L`; - `parent` and `child` must be valid node IDs; - `time[parent]` > `time[child]`; - edges must be unique (i.e., no duplicate edges are allowed). The first requirement simply ensures that the interval makes sense. 
The third requirement ensures that we cannot have loops, since time is always increasing as we ascend the tree. To ensure a valid tree sequence there is one further requirement: - The set of intervals on which each node is a child must be disjoint. This guarantees that we cannot have contradictory edges (i.e., where a node `a` is a child of both `b` and `c`), and ensures that at each point on the sequence we have a well-formed forest of trees. In the interest of algorithmic efficiency, edges must have the following sortedness properties: - All edges for a given parent must be contiguous; - Edges must be listed in nondecreasing order of `parent` time; - Within the edges for a given `parent`, edges must be sorted first by `child` ID and then by `left` coordinate. Violations of these requirements are detected at load time. The {meth}`TableCollection.sort` method will ensure that these sortedness properties are fulfilled. (sec_site_requirements)= #### Site requirements Given a valid set of nodes and a sequence length {math}`L`, the simple requirements for a valid set of sites are: - We must have a finite coordinate with {math}`0 \leq` `position` {math}`< L`; - `position` values must be unique. For simplicity and algorithmic efficiency, sites must also: - Be sorted in increasing order of `position`. Violations of these requirements are detected at load time. The {meth}`TableCollection.sort` method ensures that sites are sorted according to these criteria. 
(sec_mutation_requirements)= #### Mutation requirements Given a valid set of nodes, edges and sites, the requirements for a valid set of mutations are: - `site` must refer to a valid site ID; - `node` must refer to a valid node ID; - `time` must either be `UNKNOWN_TIME` (a NAN value which indicates the time is unknown) or be a finite value which is greater or equal to the mutation `node`'s `time`, less than the `node` above the mutation's `time` and equal to or less than the `time` of the `parent` mutation if this mutation has one. If one mutation on a site has `UNKNOWN_TIME` then all mutations at that site must also have `UNKNOWN_TIME`, as a mixture of known and unknown is not valid. - `parent` must either be the null ID (-1), if the mutation has no parent, or a valid mutation ID within the current table. Furthermore, - The `parent` value must be consistent with the topology of the tree at the site of the mutation, such that a path from the child mutation to the parent mutation exists without passing through any other mutations at the same site. For simplicity and algorithmic efficiency, mutations must also: - be sorted by site ID; - when there are multiple mutations per site, mutations should be ordered by decreasing time, if known, and parent mutations must occur **before** their children (i.e. if a mutation with ID {math}`x` has `parent` with ID {math}`y`, then we must have {math}`y < x`). Violations of these sorting requirements are detected at load time. The {meth}`TableCollection.sort` method ensures that mutations are sorted according to site ID, but does not at present enforce that mutations occur after their parent mutations. Silent mutations (i.e., mutations for which the ancestral and derived states are the same) are allowed. For example, if we have a site with ancestral state of "A" and a single mutation with derived state "A", then this mutation does not result in any change of state. (This addition was made in release C_0.99.11.)
:::{note} As `tskit.UNKNOWN_TIME` is implemented as a `NaN` value, tests for equality will always fail. Use `tskit.is_unknown_time` to detect unknown values. ::: (sec_migration_requirements)= #### Migration requirements Given a valid set of nodes and edges, the requirements for a valid set of migrations are: - `left` and `right` must be finite values that lie within the tree sequence coordinate space (i.e., from 0 to `sequence_length`), with {math}`0 \leq` `left` {math}`<` `right` {math}`\leq L`; - `node` must be a valid node ID; - if population references are checked, `source` and `dest` must be valid population IDs; - `time` must be a finite value. To enable efficient processing, migrations must also be sorted by nondecreasing `time` value. Conceptually, a migration records that a segment of ancestry for the given `node` moves between populations along the tree. In typical demographic models we expect: - `time` to lie strictly between the time of the migrating `node` and the time of any ancestral node from which that node inherits on the segment `[left, right)`; - the `population` of any such ancestor to match the `source` population, until another `migration` intervenes. These conceptual relationships are not currently validated. It is the responsibility of code that creates migrations to satisfy them where required. Note in particular that there is no requirement that adjacent migration records should be "squashed". That is, we can have two records `m1` and `m2` such that `m1.right` = `m2.left` and with the `node`, `source`, `dest` and `time` fields equal. This is because such records will usually represent two independent ancestral segments migrating at the same time, and as such squashing them into a single record would result in a loss of information. (sec_population_requirements)= #### Population requirements There are no requirements on a population table. 
(sec_provenance_requirements)= #### Provenance requirements The `timestamp` column of a provenance table should be in [ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format. The `record` column stores a JSON document describing how and where the tree sequence was produced. For tree sequences generated by tskit and related tools, this JSON is expected to conform to the {ref}`provenance schema ` described in {ref}`sec_provenance`. (sec_table_indexes)= ### Table indexes To efficiently iterate over the trees in a tree sequence, `tskit` uses indexes built on the edges. To create a tree sequence from a table collection the tables must be indexed; the {meth}`TableCollection.build_index` method can be used to create an index on a table collection if necessary. :::{todo} Add more details on what the indexes actually are. ::: (sec_data_model_saving)= ### Saving to file When serializing (e.g. storing a {class}`TreeSequence` to disk using {meth}`dump`), the underlying tables are stored along with the indexes, top-level metadata, attributes such as the sequence length and time units, and the {ref}`sec_data_model_reference_sequence` if it exists. {func}`Loading ` such a file returns an immutable tree sequence object, with pre-calculated indexes immediately available. See the {ref}`sec_tree_sequence_file_format` section for more details. Although data in a raw {class}`TableCollection` need not conform to the {ref}`sec_valid_tree_sequence_requirements`, it too can be {meth}`dumped ` to a file (with indexes stored if they exist). (sec_data_model_data_encoding)= ## Data encoding In this section we describe the high-level details of how data is encoded in tables. Tables store data in a **columnar** manner. In memory, each table is organised as a number of blocks of contiguous storage, one for each column. There are many advantages to this approach, but the key property for us is that it allows for very efficient transfer of data in and out of tables.
Rather than inserting data into tables row-by-row (which can be done in Python {ref}`using the add_row methods`), it is much more efficient to add many rows at the same time by providing pointers to blocks of contiguous memory. By taking this approach, we can work with tables containing gigabytes of data very efficiently. For instance, in the {ref}`sec_python_api` we can use the [numpy Array API](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.html) to allow us to define and work with numeric arrays of the required types. Node IDs, for example, are defined using 32 bit integers. Thus, the `parent` column of an {ref}`sec_edge_table_definition` with `n` rows is a block of `4n` bytes. This approach is very straightforward for columns in which each row contains a fixed number of values. However, dealing with columns containing a **variable** number of values is more problematic. (sec_encoding_ragged_columns)= ### Encoding ragged columns A **ragged** column is a column in which the rows are not of a fixed length. For example, {ref}`sec_metadata_definition` columns contain binary data of arbitrary length. To encode such columns in the tables API, we store **two** columns: one contains the flattened array of data and another stores the **offsets** of each row into this flattened array. Consider an example: ```{code-cell} ipython3 import tskit s = tskit.SiteTable() s.add_row(0, "A") s.add_row(0, "") s.add_row(0, "TTT") s.add_row(0, "G") s ``` In this example we create a {ref}`sec_site_table_definition` with four rows, and then display this table. We can see that the second row has the empty string as its `ancestral_state`, and the third row's `ancestral_state` is `TTT`.
Now let's print out the columns: ```{code-cell} ipython3 print("Ancestral state (numerical): ", s.ancestral_state) print("Ancestral state (as bytes): ", s.ancestral_state.tobytes()) print("Ancestral state offsets: ", s.ancestral_state_offset) ``` When we print out the table's `ancestral_state` column, we see that it's a numpy array of length 5: this is the flattened array of [ASCII encoded](https://en.wikipedia.org/wiki/ASCII) values for these rows. When we decode these bytes using the numpy {meth}`tobytes` method, we get the string 'ATTTG'. This flattened array can now be transferred efficiently in memory like any other column. We then use the `ancestral_state_offset` column to allow us to find the individual rows. For a row `j`: ancestral_state[ancestral_state_offset[j]: ancestral_state_offset[j + 1]] gives us the array of bytes for the ancestral state in that row. For example, here is row 2: ```{code-cell} ipython3 s.ancestral_state[s.ancestral_state_offset[2]: s.ancestral_state_offset[3]].tobytes() ``` For a table with `n` rows, any offset column must have `n + 1` values, the first of which is always `0`. The values in this column must be nondecreasing, and cannot exceed the length of the ragged column in question. (sec_data_model_reference_sequence)= ## Reference sequence Along with the topology and site information stored in the tskit tree sequence, we can also optionally store an associated reference sequence. Reference sequences are flexible, and can consist simply of some metadata recording which assembly build a tree sequence uses, or storing the entire sequence itself. :::{warning} Reference sequence support in tskit is preliminary. Reference sequence data can be stored and accessed via the C API. Support in the Python API is limited to usage in {meth}`.TreeSequence.alignments` and related methods, where it provides the default values for nucleotide positions between {ref}`sites`.
::: (sec_data_model_tree_structure)= ## Tree structure (sec_data_model_quintuply_linked_trees)= ### Quintuply linked trees Tree structure in `tskit` is encoded internally as a "quintuply linked tree", a generalisation of the triply linked tree encoding used by Knuth and others. Nodes are represented by their integer IDs, and their relationships to other nodes are recorded in the `parent`, `left_child`, `right_child`, `left_sib` and `right_sib` arrays. For example, consider the following tree and its associated arrays: ```{code-cell} ipython3 :tags: ["hide-input"] import io import tskit from IPython.display import SVG nodes = """\ id is_sample time 0 1 0 1 1 0 2 1 0 3 1 0 4 1 0 5 0 1 6 0 2 7 0 3 """ edges = """\ left right parent child 0 60 5 4,3 0 40 6 2 0 60 6 1,0 20 40 6 5 0 20 7 5 40 60 7 5 0 60 7 6 40 60 7 2 """ ts = tskit.load_text( nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False ) SVG(ts.first().draw_svg(time_scale="rank")) ``` ```{code-cell} ipython3 :tags: ["hide-input"] from IPython.display import HTML def html_quintuple_table(ts, show_virtual_root=False, show_convenience_arrays=False): tree = ts.first() columns = ["node", "parent", "left_child", "right_child", "left_sib", "right_sib"] convenience_arrays = ["num_children", "edge"] if show_convenience_arrays: columns += convenience_arrays data = {k:[] for k in columns} for u in sorted(tree.nodes(tree.virtual_root if show_virtual_root else None)): for colname in columns: data[colname].append(u if colname == "node" else getattr(tree, colname)(u)) html = "" for colname in columns: html += f"{colname}" html += "" for u in range(len(data["node"])): html += "" if u < ts.num_nodes else "" for colname in columns: html += f"{data[colname][u]}" html += "" return "" + html + "
" HTML(html_quintuple_table(ts)) ``` Each node in the tree corresponds to a row in this table, and the columns are the individual arrays recording the quintuply linked structure. Thus, we can see that the parent of nodes `0`, `1`, and `2` is `6`. Similarly, the left child of `6` is `0` and the right child of `6` is `2`. The `left_sib` and `right_sib` arrays then record each nodes sibling on its left or right, respectively; hence the right sib of `0` is `1`, and the right sib of `1` is `2`. Thus, sibling information allows us to efficiently support trees with arbitrary numbers of children. In each of the five pointer arrays, the null node (-1) is used to indicate the end of a path; thus, for example, the parent of `7` and left sib of `0` are null. Please see this {ref}`example ` for details of how to use the quintuply linked structure in the C API. :::{note} For many applications we do not need the quintuply linked trees, and (for example) the `left_sib` and `right_child` arrays can be ignored. The reason for using a quintuply instead of triply linked encoding is that it is not possible to efficiently update the trees as we move along the sequence without the quintuply linked structure. ::: :::{warning} The left-to-right ordering of nodes is determined by the order in which edges are inserted into the tree during iteration along the sequence. Thus, if we arrive at the same tree by iterating from different directions, the left-to-right ordering of nodes may be different! The specific ordering of the children of a node should therefore not be depended on. ::: ### Convenience arrays Similar to the five arrays representing the {ref}`quintuply linked tree`, convenience arrays track information on each node in the tree. These arrays are not essential to represent the trees within a tree sequence. However, they can be useful for specific algorithms (e.g. when computing tree (im)balance metrics). 
Two convenience arrays have been implemented so far: {attr}`Tree.num_children_array` and {attr}`Tree.edge_array`. Here is the table above with the convenience arrays also shown: ```{code-cell} ipython3 :tags: ["hide-input"] HTML(html_quintuple_table(ts, show_convenience_arrays=True)) ``` (sec_data_model_tree_roots)= ### Roots In the `tskit` {class}`trees ` we have shown so far, all the sample nodes have been connected to each other. This means each tree has only a single {attr}`~Tree.root` (i.e. the oldest node found when tracing a path backwards in time from any sample). However, a tree can contain {ref}`sec_data_model_tree_isolated_sample_nodes` or unconnected topologies, and can therefore have *multiple* {attr}`~Tree.roots`. Here's an example, created by deleting the edge joining `6` and `7` in the tree sequence used above: ```{code-cell} ipython3 :tags: ["hide-input"] tables = ts.dump_tables() tables.edges.truncate(ts.num_edges - 1) ts_multiroot = tables.tree_sequence() SVG(ts_multiroot.first().draw_svg(time_scale="rank")) ``` In `tskit` terminology, this should *not* be thought of as two separate trees, but as a single multi-root "tree", comprising two unlinked topologies. This fits with the definition of a tree in a tree sequence: a tree describes the ancestry of the same fixed set of sample nodes at a single position in the genome. In the picture above, *both* the left and right hand topologies are required to describe the genealogy of samples 0..4 at this position. Here's what the entire tree sequence now looks like: ```{code-cell} ipython3 :tags: ["hide-input"] SVG(ts_multiroot.draw_svg(time_scale="rank")) ``` From the terminology above, it can be seen that this tree sequence consists of only three trees (not five). The first tree, which applies from position 0 to 20, is the one used in our example. As we saw, removing the edge connecting node 6 to node 7 has created a tree with 2 roots (and thus 2 unconnected topologies in a single tree). 
In contrast, the second tree, from position 20 to 40, has a single root. Finally the third tree, from position 40 to 60, again has two roots. (sec_data_model_tree_root_threshold)= #### The root threshold The roots of a tree are defined by reference to the {ref}`sample nodes`. By default, roots are the unique endpoints of the paths traced upwards from the sample nodes; equivalently, each root counts one or more samples among its descendants (or is itself a sample node). This is the case when the {attr}`~Tree.root_threshold` property of a tree is left at its default value of `1`. If, however, the `root_threshold` is (say) `2`, then a node is considered a root only if it counts at least two samples among its descendants. Setting an alternative `root_threshold` value can be used to avoid visiting {ref}`sec_data_model_tree_isolated_sample_nodes`, for example when dealing with trees containing {ref}`sec_data_model_missing_data`. (sec_data_model_tree_virtual_root)= #### The virtual root To access all the {attr}`~Tree.roots` in a tree, tskit uses a special additional node called the **virtual root**. This is primarily a bookkeeping device, and can normally be ignored: it is not plotted in any visualizations and does not exist as an independent node in the node table. However, the virtual root can be useful in certain algorithms because its children are defined as all the "real" roots in a tree. Hence by descending downwards from the virtual root, it is possible to access the entire genealogy at a given site, even in a multi-root tree. In the quintuply linked tree encoding, the virtual root appears as an extra element at the end of each of the tree arrays. 
Here's the same table as before but with the virtual root also shown, using red italics to emphasise that it is not a "real" node: ```{code-cell} ipython3 :tags: ["hide-input"] HTML(html_quintuple_table(ts_multiroot, show_virtual_root=True)) ``` You can see that the virtual root (node 8) has 6 as its left child and 7 as its right child. Importantly, though, this is an asymmetric relationship: the parent of the "real" roots 6 and 7 is null (-1) and *not* the virtual root. Hence when we ascend up the tree from the sample nodes to their parents, we stop at the "real" roots, and never encounter the virtual root. Because the virtual root can be useful in some algorithms, it can optionally be returned in traversal orders (see {meth}`.Tree.nodes`). The following properties apply: - All trees in a tree sequence share the same virtual root. - The virtual root's ID is always equal to the number of nodes in the tree sequence (i.e. the length of the node table). However, there is **no corresponding row** in the node table, and any attempts to access information about the virtual root via either the tree sequence or tables APIs will fail with an out-of-bounds error. - The parent and siblings of the virtual root are null. - The time of the virtual root is defined as positive infinity (if accessed via {meth}`.Tree.time`). This is useful in defining the time-based node traversal orderings. - The virtual root is the parent of no other node---roots do **not** have parent pointers to the virtual root. (sec_data_model_tree_isolated_nodes)= ### Isolated nodes In a tree, it is possible for a node to have no children and no parent. Such a node is said to be *isolated*, meaning that we don't know anything about its relationships over a specific genomic interval. This is commonly true for ancestral genomes, which often have large regions that have not been inherited by any of the {ref}`sample nodes` in the tree sequence, and therefore regions about which we know nothing. 
This is true, for example, of node 7 in the middle tree of our previous example, which is why it is not plotted on that tree: ```{code-cell} ipython3 display(SVG(ts_multiroot.draw_svg(time_scale="rank"))) for tree in ts_multiroot.trees(): print( "Node 7", "is" if tree.is_isolated(7) else "is not", "isolated from position", tree.interval.left, "to", tree.interval.right, ) ``` (sec_data_model_tree_isolated_sample_nodes)= #### Isolated sample nodes It is also possible for a {ref}`sample node` to be isolated. As long as the {ref}`root threshold` is set to its default value, an isolated *sample* node will count as a root, and therefore be considered as being present on the tree (meaning it will be returned by the {meth}`Tree.nodes` and {meth}`Tree.samples` methods). When displaying a tree, isolated samples are shown unconnected to other nodes. To illustrate, we can remove the edge from node 2 to node 7: ```{code-cell} ipython3 :tags: ["hide-input"] tables = ts_multiroot.dump_tables() tables.edges.set_columns( **tables.edges[(tables.edges.parent != 7) | (tables.edges.child != 2)].asdict()) ts_isolated = tables.tree_sequence() SVG(ts_isolated.draw_svg(time_scale="rank")) ``` The rightmost tree now contains an isolated sample node (node 2), which counts as one of the {ref}`sec_data_model_tree_roots` of the tree. This tree therefore has three roots, one of which is node 2: ```{code-cell} ipython3 rightmost_tree = ts_isolated.at_index(-1) print(rightmost_tree.num_roots, "roots in the rightmost tree, with IDs", rightmost_tree.roots) print( "IDs of isolated samples in this tree:", [u for u in rightmost_tree.samples() if rightmost_tree.is_isolated(u)], ) ``` In `tskit`, isolated sample nodes are closely associated with the encoding of {ref}`sec_data_model_missing_data`. (sec_data_model_tree_dead_leaves_and_branches)= ### Dead leaves and branches In a `tskit` tree, a *leaf node* is defined as a node without any children. 
The implications of this turn out to be slightly unintuitive, and so are worth briefly documenting here. Firstly, the same node can be a leaf in one tree, and not a leaf in the next tree along the tree sequence. Secondly all isolated nodes must be leaves (as by definition they have no children). Thirdly sample nodes need not be leaves (they could be "internal samples"); likewise leaf nodes need not be samples. Node 7 in the example above provides a good case study. Note that it is a root node with at least one child (i.e. not a leaf) in trees 0 and 2; in contrast in tree 1 it is isolated. Strictly, because it is isolated in tree 1, it is also a leaf node there, although it is not attached to a root, not a sample, and is therefore not plotted. In this case, in that tree we can think of node 7 as a "dead leaf" (and we don't normally plot dead leaves). In fact, in a large tree sequence of many trees, most ancestral nodes will be isolated in any given tree, and therefore most nodes in such a tree will be of this sort. However, these dead leaves are excluded from most calculations on trees, because algorithms usually traverse the tree by starting at a root and working down, or by starting at a sample and working up. Hence when we refer to the leaves of a tree, it is usually shorthand for the leaves **on** the tree (that is, attached via branches, to one of the tree roots). Dead leaves are excluded from this definition. Note that it is also possible to have trees in which there are "dead branches": that is sections of topology which are not accessible from a root, and whose tips are all dead leaves. Although valid, this is a relatively unusual state of affairs, and such branches are not plotted by the standard {ref}`sec_tskit_viz` methods. The {meth}`Tree.nodes` method will not, by default, traverse through dead branches, although it can be made to do so by specifying the ID of a dead node as the root for traversal.
(sec_data_model_genetic_data)= ## Encoding genetic variation Genetic variation is incorporated into a tree sequence by placing {ref}`mutations` at {ref}`sites` along the genome. The genotypes of the different samples at each site can be found by using the tree to calculate which mutations are inherited by the different samples. This is the fundamental basis of how tree sequences efficiently encode DNA sequences, and is explained in depth elsewhere (e.g. {ref}`in the tutorials`). Below, we discuss some implications of this encoding in more detail, in particular the way in which it can be used to model missing data. (sec_data_model_missing_data)= ### Missing data If, at a particular genomic position, a node is {ref}`isolated` *and* additionally has no mutations directly above it, its genotype at that position is considered to be unknown (however, if there is a mutation above an isolated node, it can be thought of as saying directly what the genotype is, and so renders the genotype at that position not missing). By way of illustration, we'll use the {meth}`~TableCollection.delete_intervals` method to remove all knowledge of the ancestry in the middle portion of the previous example (say from position 15 to 45) sprinkle on some mutations, and make sure there are sites at every position: ```{code-cell} ipython3 :tags: ["hide-input"] import numpy as np import msprime tables = msprime.sim_mutations(ts_isolated, rate=0.1, random_seed=123).dump_tables() tables.delete_intervals([[15, 45]], simplify=False) missing_sites = np.setdiff1d(np.arange(tables.sequence_length), tables.sites.position) for pos in missing_sites: tables.sites.add_row(position=pos, ancestral_state="A") # Add sites at every pos tables.sort() missing_ts = tables.tree_sequence() SVG(missing_ts.draw_svg()) ``` The middle section of the genome now has no ancestry at all, and therefore for any site that is in this region, the genotypic state that it is assigned is a special value `tskit.MISSING_DATA`, or `-1`. 
The {meth}`~TreeSequence.haplotypes()` method, which outputs the actual allelic state for each sample, defaults to outputting an `N` at these sites. Therefore where any sample node is isolated, the haplotype will show an `N`, indicating the DNA sequence is unknown. This will be so not only in the middle of all of the sample genomes, but also at the right hand end of the genome of sample 2, as it is an isolated sample node in the rightmost tree: ```{code-cell} ipython3 for i, h in enumerate(missing_ts.haplotypes()): print(f"Sample {i}: {h}") ``` See the {meth}`TreeSequence.variants` method and {class}`Variant` class for more information on how missing data is represented in variant data. (sec_gotchas)= ## Possibly surprising consequences of the data model This is a section of miscellaneous issues that might trip even an experienced user up, also known as "gotchas". The current examples are quite uncommon, so can be ignored for most purposes, but the list may be expanded in the future. ### Unrelated material Usually, all parts of a tree sequence are ancestral to at least one sample, since that's essentially the definition of a sample: the genomes that we're describing the ancestry of. However, in some cases there will be portions of the tree sequence from which no samples inherit - notably, the result of a forwards simulation that has not been simplified. In fact, if the simulation has not coalesced, one can have entire portions of some marginal tree that are unrelated to any of the samples (for instance, an individual in the initial generation of the simulation that had no offspring). This can lead to a gotcha: the *roots* of a tree are defined to be only those roots *reachable from the samples* (and, furthermore, reachable from at least `root_threshold` samples; see {meth}`TreeSequence.trees`). So, our unlucky ancestor would not appear in the list of `roots`, even though if we drew all the relationships provided by the tree sequence, they'd definitely be a root. 
Furthermore, only nodes *reachable from a root* are included in the {meth}`Tree.nodes`. So, if you iterate over all the nodes in each marginal tree, you won't see those parts of the tree sequence that are unrelated to the samples. If you need to get those, too, you could either work with the {meth}`TreeSequence.edge_diffs` directly, or iterate over all nodes (instead of over {meth}`Tree.nodes`). ================================================ FILE: docs/development.md ================================================ --- jupytext: text_representation: extension: .md format_name: myst format_version: 0.12 jupytext_version: 1.9.1 kernelspec: display_name: Python 3 language: python name: python3 --- ```{currentmodule} tskit ``` (sec_development)= # Development If you would like to add some features to `tskit`, this documentation should help you get set up and contributing. Please help us to improve the documentation by either opening an [issue](http://github.com/tskit-dev/tskit/issues) or [pull request](http://github.com/tskit-dev/tskit/pulls) if you see any problems. The tskit-dev team strives to create a welcoming and open environment for contributors; please see our [code of conduct](https://github.com/tskit-dev/.github/blob/main/CODE_OF_CONDUCT.md) for details. We wish our code and documentation to be [inclusive](https://chromium.googlesource.com/chromium/src/+/master/styleguide/inclusive_code.md) and in particular to be gender and racially neutral. (sec_development_repo_admin)= ## Repo administration tskit is one of several packages in the tskit-dev ecosystem. Shared conventions for CI workflows, dependency management, repository layout, and releases are documented in the [repo administration guide](https://github.com/tskit-dev/.github/blob/main/repo_administration.md) in the `tskit-dev/.github` repository. Maintainers should read that document before making changes to CI configuration, dependency groups, or the release process. 
(sec_development_structure)= ## Project structure Tskit is a multi-language project, which is reflected in the directory structure: - The `python` directory contains the Python library and command line interface, which is what most contributors are likely to be interested in. Please see the {ref}`sec_development_python` section for details. The low-level {ref}`sec_development_python_c` is also defined here. - The `c` directory contains the high-performance C library code. Please see the {ref}`sec_development_c` for details on how to contribute. - The `docs` directory contains the source for this documentation, which covers both the Python and C APIs. Please see the {ref}`sec_development_documentation` for details. The remaining files in the root directory of the project are for controlling {ref}`sec_development_continuous_integration` providers and other administrative purposes. Please see the {ref}`sec_development_best_practices` section for an overview of how to contribute a new feature to `tskit`. (sec_development_getting_started)= ## Getting started (sec_development_getting_started_requirements)= ### Requirements To develop the Python code you will need a working C compiler and some build utilities. Additionally, the doxygen package is required for building the C API documentation. On Debian/Ubuntu we can install these with: ```bash sudo apt install build-essential doxygen ``` On macOS, either `brew install doxygen` or `sudo port install doxygen` should get doxygen. You'll also need the "essential build" tools: a compiler (`gcc`) and a few other things (e.g., `make`). All Python development is managed using [uv](https://docs.astral.sh/uv/), which takes the place of virtual/conda environments.
It is not strictly necessary to use uv in order to make small changes, but if you don't use it, you'll need to figure out how to install python dependencies on your own, and the development workflows of all tskit-dev packages are organised around using uv, and therefore we strongly recommend using it. Uv is straightforward to install, and not invasive (existing Python installations can be completely isolated if you don't use features like ``uv tool`` etc which update your ``$HOME/.local/bin``). Uv manages an isolated local environment per project and allows us to deterministically pin package versions and easily switch between Python versions, so that CI environments can be replicated exactly locally. The packages needed for development are specified as dependency groups in ``python/pyproject.toml`` and managed with [uv](https://docs.astral.sh/uv/). Install all development dependencies by running: ```bash cd python uv sync ``` Since `uv` operates from the `python/` subdirectory, **all `uv` commands below must be run from within that subdirectory**; otherwise errors like "No such file or directory" will occur. The lock file lives at `python/uv.lock` and must be kept up to date. Run `uv lock` after any change to the dependencies in `python/pyproject.toml`. A few extra dependencies are required if you wish to work on the {ref}`C library `. (sec_development_getting_started_environment)= ### Environment To get a local git development environment, please follow these steps: - Make a fork of the tskit repo on [GitHub](http://github.com/tskit-dev/tskit) - Clone your fork into a local directory: ```bash git clone git@github.com:YOUR_GITHUB_USERNAME/tskit.git ``` - Install the {ref}`sec_development_workflow_prek` pre-commit hook (again from the ``python/`` subdirectory): ```bash uv run prek install ``` See the {ref}`sec_development_workflow_git` section for detailed information on the recommended way to use git and GitHub. 
(sec_development_workflow)= ## Workflow (sec_development_workflow_git)= ### Git workflow If you would like to make an addition/fix to tskit, then follow the steps below to get things set up. If you would just like to review someone else's proposed changes (either to the code or to the docs), then skip to {ref}`sec_development_workflow_anothers_commit`. 0. Open an [issue](http://github.com/tskit-dev/tskit/issues) with your proposed functionality/fix. If adding or changing the public API close thought should be given to names and signatures of proposed functions. If consensus is reached that your proposed addition should be added to the codebase, proceed! 1. Make your own [fork](https://help.github.com/articles/fork-a-repo/) of the `tskit` repository on GitHub, and [clone](https://help.github.com/articles/cloning-a-repository/) a local copy as detailed in {ref}`sec_development_getting_started_environment`. 2. Make sure that your local repository has been configured with an [upstream remote]( https://help.github.com/articles/configuring-a-remote-for-a-fork/): ```bash git remote add upstream https://github.com/tskit-dev/tskit.git ``` 3. Create a "topic branch" to work on. One reliable way to do it is to follow this recipe: ```bash git fetch upstream git checkout -b topic_branch_name upstream/main ``` 4. Write your code following the outline in {ref}`sec_development_best_practices`. As you work on your topic branch you can add commits to it. Once you're ready to share this, you can then open a [pull request (PR)](https://help.github.com/articles/about-pull-requests/). This can be done at any time! You don't have to have code that is completely functional and tested to get feedback. Use the drop-down button to create a "draft PR" to indicate that it's not done, and explain in the comments what feedback you need and/or what you think needs to be done. 5. 
As you code it is best to [rebase](https://stdpopsim.readthedocs.io/en/latest/development.html#rebasing) your work onto the `main` branch periodically (e.g. once a week) to keep up with changes. If you merge `main` via `git pull upstream main` it will create a much more complex rebase when your code is finally ready to be incorporated into the main branch, so should be avoided. 6. Once you're done coding add content to the tutorial and other documentation pages if appropriate. 7. Update the change logs at `python/CHANGELOG.rst` and `c/CHANGELOG.rst`, taking care to document any breaking changes separately in a "breaking changes" section. 8. Push your changes to your topic branch and either open the PR or, if you already opened a draft PR change it to a non-draft PR by clicking "Ready to Review". 9. The tskit community will review the code, asking you to make changes where appropriate. This usually takes at least two rounds of review. 10. Once the review process is complete, squash the commits to the minimal set of changes - usually one or two commits. Please follow [this guide](https://stdpopsim.readthedocs.io/en/stable/development.html#rebasing) for step-by-step instructions on rebasing and squashing commits. 11. Your PR will be merged, time to celebrate! 🎉🍾 (sec_development_workflow_anothers_commit)= ### Checking out someone else's pull request Sometimes you want to just check out someone else's pull request, for the purpose of trying it out and giving them feedback. To do this, you first need your own local version of the git repository, so you should first do steps 1 and 2 above. (Strictly speaking, you don't need a fork on github if you don't plan to edit, but it won't hurt.) Continuing from there, let's say you want to check out the current state of the code on [pull request #854](https://github.com/tskit-dev/tskit/pull/854). (So, below you should replace `854` with the number of the pull request that you actually want to investigate.) 
Then, continuing from above: 3. Fetch the pull request, and store it as a local branch. For instance, to name the local branch `my_pr_copy`: ```bash git fetch upstream pull/854/head:my_pr_copy ``` You should probably call the branch something more descriptive, though. (Also note that you might need to put `origin` instead of `upstream` for the remote repository name: see `git remote -v` for a list of possible remotes.) 4. Check out the pull request's local branch: ```bash git checkout my_pr_copy ``` Now, your repository will be in exactly the same state as that of the person who's submitted the pull request. Great! Now you can test things out. To view the documentation, `cd docs && make`, which should build the documentation, and then navigate your web browser to the `docs/_build/html/` subdirectory. To test out changes to the *code*, you can change to the `python/` subdirectory, and run `make` to compile the C code. If you then execute python commands from this subdirectory (and only this one!), it will use the modified version of the package. (For instance, you might want to open an interactive python shell by running `uv run python` in the `python/` subdirectory, or running `uv run pytest` from this subdirectory.) After you're done, you should do: ```bash git checkout main ``` to get your repository back to the "main" branch of development. If the pull request is changed and you want to do the same thing again, then to avoid conflicts with any changes you might have made, first *delete* your local copy (by doing `git branch -d my_pr_copy`) and repeat the steps again. (sec_development_workflow_prek)= ### Lint checks (prek) On each commit a [prek](https://prek.j178.dev) hook will run checks for code style (see the {ref}`sec_development_python_style` section for details) and other common problems. 
To run checks manually without committing, from the `python/` subdirectory: ```bash uv run prek --all-files ``` If local results differ from CI, run `uv run prek cache clean` to clear the cache. To bypass the checks temporarily use `git commit --no-verify`. (sec_development_documentation)= ## Documentation The documentation for tskit is written using [Sphinx](http://www.sphinx-doc.org/en/stable/) and contained in the `docs` directory. Narrative pages are written in [MyST Markdown](https://jupyterbook.org/content/myst.html) and built with [JupyterBook](https://jupyterbook.org/), which executes embedded Python code cells and inserts their output before deployment. API docstrings are written in [reStructuredText](http://docutils.sourceforge.net/rst.html). For the C code, a combination of [Doxygen](http://www.doxygen.nl/) and [breathe](https://breathe.readthedocs.io/en/latest/) generates API documentation. The docs are deployed automatically to the [tskit.dev website](https://tskit.dev/). Please help us to improve the documentation! You can check on the list of [documentation issues](https://github.com/tskit-dev/tskit/issues?q=is%3Aissue+is%3Aopen+label%3Adocumentation) on GitHub, and help us fix any, or add issues for anything that's wrong or missing. ### Small edits If you see a typo or some other small problem that you'd like to fix, this is most easily done through the GitHub UI. Mouse over the GitHub icon at the top right of the page and click on the "Suggest edit" button. This will bring you to a web editor on GitHub for the source file in question, allowing you to quickly fix the typo and submit a pull request with the changes. Fix the typo, click the "Commit changes", add a commit message like "Fixed typo" and click on the green "Propose file change" button. Then follow the dialogues until you've created a new pull request with your changes, so that we can incorporate them. 
If the change you'd like to make is in the API documentation for a particular function, then you'll need to find where this function is defined first. The simplest way to do this is to click the green "[source]" link next to the function. This will show you a HTML rendered version of the function, and the rest of the file that it is in. You can then navigate to this file on GitHub, and edit it using the same approach as above. ### Significant edits When making changes more substantial than typo fixes it's best to check out a local copy. Follow the steps in the {ref}`sec_development_workflow_git` to get a fork of tskit, a local clone and newly checked out feature branch. Then follow the steps in the {ref}`sec_development_getting_started` section to get a working development environment. Once you are ready to make edits to the documentation, `cd` into the `docs` directory and run `make`. This should build the HTML documentation in `docs/_build/html/`, which you can then view in your browser. As you make changes, run `make` regularly and view the final result to see if it matches your expectations. Once you are happy with the changes, commit your updates and open a pull request on GitHub. (sec_development_documentation_markup)= ### Markup languages Because of the mixture of API documentation and notebook content, documentation is written using **two different markup languages**: - **MyST Markdown** for all narrative pages, thematic sections, and code examples. This is a superset of [CommonMark](https://commonmark.org) that enables executable Jupyter content and Sphinx cross-referencing. - **reStructuredText (rST)** for API docstrings embedded in the source code. These are processed by Sphinx and appear in the API reference pages. 
Some useful links for MyST: - The [MyST cheat sheet](https://jupyterbook.org/reference/cheatsheet.html) - The "Write Book Content" section of the [Jupyter Book](https://jupyterbook.org/) docs - The [MyST Syntax Guide](https://myst-parser.readthedocs.io/en/latest/using/syntax.html) - The [Sphinx domains reference](https://www.sphinx-doc.org/en/master/usage/restructuredtext/domains.html) for marking up Python and C API elements - The [types of source files](https://jupyterbook.org/file-types/index.html) in the Jupyter Book docs (useful for understanding the MyST/rST mix) Some directives are only available in rST and must be wrapped in an ``eval-rst`` block within a Markdown file: ````md ```{eval-rst} .. autoclass:: tskit.TreeSequence ``` ```` (sec_development_documentation_api)= ### API Reference API reference documentation comes from [docstrings](https://www.python.org/dev/peps/pep-0257/) in the source code, written in rST. Docstrings should be **concise** and **precise**. Examples should not be embedded directly in docstrings; instead, each significant parameter should link to the relevant section in the narrative documentation. (sec_development_documentation_examples)= ### Examples Narrative sections should provide context and worked examples using inline Jupyter code cells. These behave exactly like cells in a Jupyter notebook — the whole page is executed as one notebook during the build. Code cells are written like this: ````md ```{code-cell} import tskit # example code here ``` ```` :::{warning} For a page to be executed as a notebook you **must** have the correct [YAML frontmatter](https://jupyterbook.org/reference/cheatsheet.html#executable-code) at the top of the file. ::: (sec_development_documentation_cross_referencing)= ### Cross referencing Use the ``{ref}`` role to link to labelled sections within the docs: ````md See the {ref}`sec_development_documentation_cross_referencing` section for details. 
```` Sections should be labelled hierarchically immediately above the heading: ````md (sec_development_documentation_cross_referencing)= ### Cross referencing ```` The section title is used as the link text automatically, but this can be overridden: ````md See {ref}`this section <sec_development_documentation_cross_referencing>` for more. ```` To refer to API elements, use the appropriate inline role: ````md The {class}`.TreeSequence` class, the {meth}`.TreeSequence.trees` method, and the {func}`.load` function. ```` From an rST docstring, use the colon-prefixed equivalents: ````rst See :ref:`sec_development_documentation_cross_referencing` for details. The :meth:`.TreeSequence.trees` method returns an iterator. ```` Some errors may occur because of out-of-date cached results, which can be cleared by running `make clean`. (sec_development_python)= ## Python library The Python library is defined in the `python` directory. We assume throughout this section that you have `cd`'d into this directory. The low-level C extension is built automatically as part of `uv sync`. Please see the {ref}`sec_development_python_troubleshooting` section for help if you encounter problems with compiling or running the tests. ### Getting started After you have installed the basic {ref}`sec_development_getting_started_requirements` and created a {ref}`development environment <sec_development_getting_started_environment>`, run `uv sync` in the `python` directory. This will install all dependencies and build the low-level {ref}`sec_development_python_c` module automatically. To make sure that your development environment is working, run some {ref}`tests <sec_development_python_tests>`. ### Layout Code for the `tskit` module is in the `tskit` directory. The code is split into a number of modules that are roughly split by function; for example, code for visualisation is kept in the `tskit/drawing.py`. Test code is contained in the `tests` directory. Tests are also roughly split by function, so that tests for the `drawing` module are in the `tests/test_drawing.py` file. This is not a one-to-one mapping, though. 
Development dependencies are specified in the `pyproject.toml` file and can be installed using `uv sync`. (sec_development_python_style)= ### Code style Python code in tskit is formatted and linted using [ruff](https://docs.astral.sh/ruff/). These checks run automatically as part of the {ref}`prek checks <sec_development_workflow_prek>` on each commit. Ruff is quite opinionated and it gains more opinions on each version. We therefore pin ruff to an exact version and maintain a list of "ignore" classes in pyproject.toml. The version of ruff should be updated periodically, with fixes applied or the ignore list extended as necessary. (sec_development_python_tests)= ### Tests The tests are defined in the `tests` directory, and run using [pytest](https://docs.pytest.org/en/stable/) from the `python` directory. If you want to run the tests in a particular module (say, `test_tables.py`), use: ```bash uv run pytest tests/test_tables.py ``` To run all the tests in a particular class in this module (say, `TestNodeTable`) use: ```bash uv run pytest tests/test_tables.py::TestNodeTable ``` To run a specific test case in this class (say, `test_copy`) use: ```bash uv run pytest tests/test_tables.py::TestNodeTable::test_copy ``` In general, you can copy-paste the string describing a failed test from the output of pytest to re-run just that test (including specific parametrized arguments present as `[args]`). You can also run tests with a keyword expression search. For example this will run all tests that have `TestNodeTable` but not `copy` in their name: ```bash uv run pytest -k "TestNodeTable and not copy" ``` When developing your own tests, it is much quicker to run the specific tests that you are developing rather than rerunning large sections of the test suite each time. 
To run all of the tests, we can use: ```bash uv run pytest ``` By default the tests are run on 4 cores; if you have more you can specify: ```bash uv run pytest -n8 ``` A few of the tests take most of the time; we can skip the slow tests to get the test run under 20 seconds on a modern workstation: ```bash uv run pytest --skip-slow ``` If you have an agent running the tests in a sandboxed environment, you may need to skip tests that require network access or FIFOs: ```bash uv run pytest --skip-network ``` If you have a lot of failing tests it can be useful to have a shorter summary of the failing lines: ```bash uv run pytest --tb=line ``` If you need to see the output of tests (e.g. `print` statements) then you need to use these flags to run a single thread and capture output: ```bash uv run pytest -n0 -vs ``` All new code must have high test coverage, which will be checked as part of the {ref}`sec_development_continuous_integration` tests by [CodeCov](https://codecov.io/gh/tskit-dev/tskit/). All tests must pass for a PR to be accepted. ### Packaging The `tskit` Python module follows the current [best-practices](http://packaging.python.org) advocated by the [Python Packaging Authority](http://pypa.io/en/latest/). The primary means of distribution is through [PyPI](http://pypi.python.org/pypi/tskit), which provides the canonical source for each release. A package for [conda](http://conda.io/docs/) is also available on [conda-forge](https://github.com/conda-forge/tskit-feedstock). ### Interfacing with low-level module Much of the high-level Python code only exists to provide a simpler interface to the low-level {ref}`_tskit <sec_development_python_c>` module. As such, many objects (e.g. {class}`.Tree`) are really just a shallow layer on top of the corresponding low-level object. The usual convention here is to keep a reference to the low-level object via a private instance variable such as `self._ll_tree`. 
### Command line interface The command line interface for `tskit` is defined in the `tskit/cli.py` file. The entry point `tskit_main` is declared under `[project.scripts]` in `python/pyproject.toml`, which makes `tskit` available as a command after installation. The CLI can also be run using `uv run python -m tskit` during development. (sec_development_installing)= ### Installing development versions We **strongly** recommend that you do not install development versions of `tskit` and instead use versions released to PyPI and conda-forge. However, if you really need to be on the bleeding edge, you can use the following command to install: ```bash python3 -m pip install git+https://github.com/tskit-dev/tskit.git#subdirectory=python ``` (Because the Python package is not defined in the project root directory, using pip to install directly from GitHub requires you to specify `subdirectory=python`.) (sec_development_python_troubleshooting)= ### Troubleshooting - If `make` is giving you strange errors, or if tests are failing for strange reasons, try running `make clean` in the project root and then rebuilding. - Beware of multiple versions of the python library installed by different programs (e.g., pip versus installing locally from source)! In python, `tskit.__file__` will tell you the location of the package that is being used. - Installation of development versions is not supported on Windows. Windows users should try using a Linux environment by using [WSL](https://learn.microsoft.com/windows/wsl/), for example. (sec_development_c)= ## C Library The Python module uses the high-performance tskit {ref}`sec_c_api` behind the scenes. All C code and associated development infrastructure is held in the `c` directory. (sec_development_c_requirements)= ### Requirements We use the [meson](https://mesonbuild.com) build system in conjunction with [ninja-build](https://ninja-build.org) to compile the C code. 
Unit tests use the [CUnit](http://cunit.sourceforge.net) library and we use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to automatically format code. On Debian/Ubuntu, install the system dependencies with: ```bash sudo apt install libcunit1-dev ninja-build ``` On macOS, you can run `brew install cunit ninja` or `sudo port install cunit ninja`. You can install meson using uv: ```bash uv tool install meson ``` An exact version of clang-format is required because formatting rules change from version to version. This is why we pin to an exact version of clang-format in pyproject.toml, which gets used by prek linting. If you wish to run clang-format yourself (e.g., within your editor) a straightforward way to do this is to use ``uv tool install clang-format==[version]``, which will install to your PATH. However, you will need to manually keep track of what version is installed (``uv tool list`` is useful for this). (sec_development_c_code_style)= ### Code style C code is formatted using [clang-format](https://clang.llvm.org/docs/ClangFormat.html) with a custom configuration. This is checked as part of the {ref}`prek checks <sec_development_workflow_prek>`. To manually format all files run: ```bash uv run prek --all-files ``` If you are doing this in the ``c`` directory, use ``uv run --project=../python prek --all-files``. If you are getting obscure errors from prek, sometimes this is caused by prek searching for configuration within subdirectories. To avoid this, tell prek where to find its config explicitly: ```bash uv run prek --all-files -c prek.toml ``` ### Building We use [meson](https://mesonbuild.com) and [ninja-build](https://ninja-build.org) to compile the C code. Meson keeps all compiled binaries in a build directory (this has many advantages such as allowing multiple builds with different options to coexist). The build configuration is defined in `meson.build`. 
To set up the initial build directory, run ```bash cd c meson setup build ``` To setup a debug build add `--buildtype=debug` to the above command. (Re-running the command with this argument will have the desired effect.) This will set the `TSK_TRACE_ERRORS` flag, which will print error messages to `stderr` when errors occur which is useful for debugging. To compile the code run ```bash ninja -C build ``` All the tests and other artefacts are in the build directory. Individual test suites can be run, via (e.g.) `./build/test_trees`. To run all of the tests, run ```bash ninja -C build test ``` For vim users, the [mesonic](https://www.vim.org/scripts/script.php?script_id=5378) plugin simplifies this process and allows code to be compiled seamlessly within the editor. ### Compile flags If the flag `TSK_TRACE_ERRORS` is defined (by e.g. `-DTSK_TRACE_ERRORS` to gcc), then error messages will be printed to `stderr` when errors occur. This also allows breakpoints to be set in the `_tsk_trace_error` function to break on all errors. ### Unit Tests The C-library has an extensive suite of unit tests written using [CUnit](http://cunit.sourceforge.net). These tests aim to establish that the low-level APIs work correctly over a variety of inputs, and particularly, that the tests don't result in leaked memory or illegal memory accesses. All tests are run under valgrind to make sure of this as part of the {ref}`sec_development_continuous_integration`. Tests are defined in the `tests/*.c` files. These are roughly split by the source files, so that the tests for functionality in the `tskit/tables.c` file will be tested in `tests/test_tables.c`. To run all the tests in the `test_tables` suite, run (e.g.) `./build/test_tables`. 
To just run a specific test on its own, provide this test name as a command line argument, e.g.: ```bash ./build/test_tables test_node_table ``` After making sure tests pass, you should next run the tests through valgrind, to check for memory leaks, for instance: ```bash valgrind ./build/test_tables test_node_table ``` While 100% test coverage is not feasible for C code, we aim to cover all code that can be reached. (Some classes of error such as malloc failures and IO errors are difficult to simulate in C.) Code coverage statistics are automatically tracked using [CodeCov](https://codecov.io/gh/tskit-dev/tskit/). ### Viewing coverage reports To generate and view coverage reports for the C tests locally: Compile with coverage enabled: ```bash cd c meson setup build -D b_coverage=true ninja -C build ``` Run the tests: ```bash ninja -C build test ``` Generate coverage data: ```bash cd build find ../tskit/*.c -type f -printf "%f\n" | xargs -i gcov -pb libtskit.a.p/tskit_{}.gcno ../tskit/{} ``` The generated `.gcov` files can then be viewed directly with `cat filename.c.gcov`. Lines prefixed with `#####` were never executed, lines with numbers show execution counts, and lines with `-` are non-executable code. `lcov` can be used to create browsable HTML coverage reports: ```bash sudo apt-get install lcov # if needed lcov --capture --directory build --output-file coverage.info genhtml coverage.info --output-directory coverage_html firefox coverage_html/index.html ``` ### Coding conventions The code is written using the [C99](https://en.wikipedia.org/wiki/C99) standard. All variable declarations should be done at the start of a function, and functions kept short and simple where at all possible. No global or module level variables are used for production code. Function parameters should be marked as ``const`` where possible. Parameters that are used as return variables should come last. The common ``options`` parameter should be the last non-output parameter. 
Please see the {ref}`sec_c_api_overview_structure` section for more information about how the API is structured. ### Error handling A critical element of producing reliable C programs is consistent error handling and checking of return values. All return values **must** be checked! In tskit, all functions (except the most trivial accessors) return an integer to indicate success or failure. Any negative value is an error, and must be handled accordingly. The following pattern is canonical: ```C ret = tsk_tree_do_something(self, argument); if (ret != 0) { goto out; } // rest of function out: return ret; ``` Here we test the return value of `tsk_tree_do_something` and if it is non-zero, abort the function and return this same value from the current function. This is a bit like throwing an exception in higher-level languages, but discipline is required to ensure that the error codes are propagated back to the original caller correctly. Particular care must be taken in functions that allocate memory, because we must ensure that this memory is freed in all possible success and failure scenarios. The following pattern is used throughout for this purpose: ```C double *x = NULL; x = malloc(n * sizeof(double)); if (x == NULL) { ret = tsk_trace_error(TSK_ERR_NO_MEMORY); goto out; } // rest of function out: tsk_safe_free(x); return ret; ``` It is vital here that `x` is initialised to `NULL` so that we are guaranteed correct behaviour in all cases. For this reason, the convention is to declare all pointer variables on a single line and to initialise them to `NULL` as part of the declaration. Error codes are defined in `core.h`, and these can be translated into a message using `tsk_strerror(err)`. When setting error codes in the C code, please use the `tsk_trace_error` function. If `TSK_TRACE_ERRORS` is defined, this will print a message to stderr with the details of the error. #### Using assertions There are two different ways to express assertions in tskit code. 
The first is using the custom `tsk_bug_assert` macro, which is used to make inexpensive checks at key points during execution. These assertions are always run, regardless of the compiler settings, and should not contribute significantly to the overall runtime. More expensive assertions, used, for example, to check pre and post conditions on performance critical loops should be expressed using the standard `assert` macro from `assert.h`. These assertions will be checked during the execution of C unit tests, but will not be enabled when compiled into the Python C module. ### Type conventions - `tsk_id_t` is an ID for any entity in a table. - `tsk_size_t` refers to any size or count values in tskit. - `size_t` is a standard C type and refers to the size of a memory block. This should only be used when computing memory block sizes for functions like `malloc` or passing the size of a memory buffer as a parameter. - Error indicators (the return type of most functions) are `int`. - `uint32_t` etc should be avoided (any that exist are a leftover from older code that didn't use `tsk_size_t` etc.) - `int64_t` and `uint64_t` are sometimes useful when working with bitstrings (e.g. to implement a set). (sec_development_python_c)= ## Python C Interface ### Overview The Python C interface is defined in the `python` directory and written using the [Python C API](https://docs.python.org/3.6/c-api/). The source code for this interface is in the `_tskitmodule.c` file. When compiled, this produces the `_tskit` module, which is imported by the high-level Python code. The low-level Python module is not intended to be used directly by users and may change arbitrarily over time. The usual pattern in the low-level Python API is to define a Python class which corresponds to a given "class" in the C API. For example, we define a `TreeSequence` class, which is essentially a thin wrapper around the `tsk_treeseq_t` type from the C library. 
The `_tskitmodule.c` file follows the standard conventions given in the [Python documentation](https://docs.python.org/3.6/extending/index.html). ### Compiling and debugging The `setup.py` file describes the requirements for the low-level `_tskit` module and how it is built from source. The module is built automatically by `uv sync`, but if you modify the C extension code you will need to rebuild it. The simplest way to do this is to run `make` in the `python` directory: ```bash make ``` If `make` is not available, you can run the same command manually: ```bash uv run python setup.py build_ext --inplace ``` It is sometimes useful to specify compiler flags when building the low level module. For example, to make a debug build you can use: ```bash CFLAGS='-Wall -O0 -g' make ``` If you need to track down a segfault etc, running some code through gdb can be very useful. For example, to run a particular test case, we can do: ```bash uv run gdb python (gdb) run -m pytest -vs tests/test_tables.py::TestNodeTable::test_copy Starting program: /usr/bin/python3 run -m pytest tests/test_tables.py::TestNodeTable::test_copy [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". [New Thread 0x7ffff1e48700 (LWP 1503)] [New Thread 0x7fffef647700 (LWP 1504)] [New Thread 0x7fffeee46700 (LWP 1505)] [Thread 0x7fffeee46700 (LWP 1505) exited] [Thread 0x7fffef647700 (LWP 1504) exited] [Thread 0x7ffff1e48700 (LWP 1503) exited] collected 1 item tests/test_tables.py::TestNodeTable::test_copy PASSED [Inferior 1 (process 1499) exited normally] (gdb) ``` Tracing problems in C code is many times more difficult when the Python C API is involved because of the complexity of Python's memory management. It is nearly always best to start by making sure that the tskit C API part of your addition is thoroughly tested with valgrind before resorting to the debugger. 
### Testing for memory leaks The Python C API can be subtle, and it is easy to get the reference counting wrong. The `stress_lowlevel.py` script makes it easier to track down memory leaks when they do occur. The script runs the unit tests in a loop, and outputs memory usage statistics. (sec_development_continuous_integration)= ## Continuous Integration tests Continuous integration is handled by [GitHub Actions](https://help.github.com/en/actions). tskit uses shared workflows defined in the [tskit-dev/.github](https://github.com/tskit-dev/.github) repository: - **lint** — runs ruff and clang (using prek) against all files - **python-tests** — runs the pytest suite with coverage on Linux, macOS and Windows - **python-c-tests** — builds the C extension with coverage and runs low-level tests - **c-tests** — runs C unit tests under gcc, clang, and valgrind - **docs** — builds the documentation and deploys it on merge to `main` - **python-packaging** — validates the sdist and wheel [CodeCov](https://codecov.io/gh) tracks test coverage for Python and C. (sec_development_best_practices)= ## Best Practices for Development The following is a rough guide of best practices for contributing a function to the tskit codebase. Note that this guide covers the most complex case of adding a new function to both the C and Python APIs. 0. Draft a docstring for your function, that describes exactly what the function takes as arguments and what it returns under what conditions. Update this docstring as you go along and make modifications. 1. Write your function in Python: in `python/tests/` find the test module that pertains to the functionality you wish to add. For instance, the kc_distance metric was added to [test_topology.py](https://github.com/tskit-dev/tskit/blob/main/python/tests/test_topology.py). Add a python version of your function here. 2. 
Create a new class in this module to write unit tests for your function: in addition to making sure that your function is correct, make sure it fails on inappropriate inputs. This can often require judgement. For instance, {meth}`Tree.kc_distance` fails on a tree with multiple roots, but allows users to input parameter values that are nonsensical, as long as they don't break functionality. See [TestKCMetric](https://github.com/tskit-dev/tskit/blob/4e707ea04adca256036669cd852656a08ec45590/python/tests/test_topology.py#L293) for an example. 3. Write your function in C: check out the {ref}`sec_c_api` for guidance. There are also many examples in the [c directory](https://github.com/tskit-dev/tskit/tree/main/c/tskit). Your function will probably go in [trees.c](https://github.com/tskit-dev/tskit/blob/main/c/tskit/trees.c). 4. Write a few tests for your function in C: again, write your tests in [tskit/c/tests/test_trees.c](https://github.com/tskit-dev/tskit/blob/main/c/tests/test_trees.c). The key here is code coverage; you don't need to worry as much about covering every corner case, as we will proceed to link this function to the Python tests you wrote earlier. 5. Create a low-level definition of your function using Python's C API: this will go in [_tskitmodule.c](https://github.com/tskit-dev/tskit/blob/main/python/_tskitmodule.c). 6. Test your low-level implementation in [tskit/python/tests/test_python_c.py](https://github.com/tskit-dev/tskit/blob/main/python/tests/test_python_c.py): again, these tests don't need to be as comprehensive as your first python tests, instead, they should focus on the interface, e.g., does the function behave correctly on malformed inputs? 7. Link your C function to the Python API: write a function in tskit's Python API, for example the kc_distance function lives in [tskit/python/tskit/trees.py](https://github.com/tskit-dev/tskit/blob/main/python/tskit/trees.py). 8.
Modify your Python tests to test the new C-linked function: if you followed the example of other tests, you might need to only add a single line of code here. In this case, the tests are well factored so that we can easily compare the results from both the Python and C versions. 9. Finalize your docstring and insert it into the Python API: for instance, the kc_distance docstring is in [tskit/python/tskit/trees.py](https://github.com/tskit-dev/tskit/blob/main/python/tskit/trees.py). Ensure that your docstring renders correctly by building the documentation (see {ref}`sec_development_documentation`). ## Troubleshooting ### prek is blocking me! The prek hook is designed to make things easier, not harder. If the checks are blocking you, feel free to skip them with `--no-verify` and sort it out before the PR is merged. There’s no shame in a broken build. ```bash > git commit -a -m 'my changes' --no-verify ``` ### prek reports unexpected failures If prek reports failures on files you didn’t edit, try clearing the cache: ```bash > uv run prek cache clean ``` If that doesn’t help, you can reinstall the hook: ```bash > uv run prek uninstall > uv run prek install ``` ## Benchmarking Tskit has a simple benchmarking tool to help keep track of performance. ### Running benchmarks The benchmark suite can be run with: ```bash > cd python/benchmark > python run.py ``` A subset of benchmarks can be run by specifying a string. For example, the following command runs all the benchmarks whose names contain "genotype", e.g. "genotype_matrix". ```bash > python run.py -k genotype ``` If desired, the results of the benchmarks can be printed to STDOUT. ```bash > python run.py -k genotype -p ``` Results are written to `bench-results.json` in the same folder. Note that if any version of `tskit` is installed then that will be used for the benchmarking. To use the local development version of tskit ensure you have run `pip uninstall tskit` before running the benchmarking.
The version used is shown in the header of the report. ### Adding a new benchmark The benchmarks are specified by the `config.yaml` file in `python/benchmark`. To add a new benchmark add an entry to the `benchmarks` dictionary. For example: ```yaml - code: do_my_thing({option_name}) setup: | import a_module name: my_benchmark #optional, the code is used by default parameters: option_name: - "reticulate_splines" - "foobar" ``` Strings are interpreted as Python f-strings, so you can use the `parameters` dictionary to provide values that will be interpolated into both the `setup` and `code` strings. The suite can be run for all released versions with the `run-for-all-releases.py` script. ## Releasing a new version See the [repo administration guide](https://github.com/tskit-dev/.github/blob/main/repo_administration.md) for the release process. Tskit has both a C API release and a Python package release, each covered in the tskit/kastore section of that document. It is worth running the benchmarks (see above) before a Python release to check for any unexpected major regressions. For a major release the website (github repo tskit-dev/tskit-site) should be updated with a notebook of new features and the `bench-results.html` updated. ================================================ FILE: docs/doxygen/Doxyfile ================================================ # Doxyfile 1.9.1 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). 
#--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the configuration # file that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = tskit # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "The tree sequence toolkit" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. 
If # left blank the current directory will be used. OUTPUT_DIRECTORY = # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise cause # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all generated output in the proper direction. # Possible values are: None, LTR, RTL and Context. # The default value is: None.
OUTPUT_TEXT_DIRECTION = None # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. 
INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before file names in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used. # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO.
JAVADOC_AUTOBRIEF = NO # If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line # such as # /*************** # as being the beginning of a Javadoc-style comment "banner". If set to NO, the # Javadoc-style will behave just like regular comments and it will not be # interpreted by doxygen. # The default value is: NO. JAVADOC_BANNER = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # By default Python docstrings are displayed as preformatted text and doxygen's # special commands cannot be used. By setting PYTHON_DOCSTRING to NO the # doxygen's special commands can be used and the contents of the docstring # documentation blocks is shown as doxygen documentation. # The default value is: YES. PYTHON_DOCSTRING = YES # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. 
# The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines (in the resulting output). You can put ^^ in the value part of an # alias to insert a newline as if a physical newline was in the original file. # When you need a literal { or } or , in the value part of an alias you have to # escape them by means of a backslash (\), this can lead to conflicts with the # commands \{ and \} for these it is advised to use the version @{ and @} or use # a double escape (\\{ and \\}) ALIASES = "rst=\verbatim embed:rst" \ endrst=\endverbatim # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. 
# The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice # sources only. Doxygen will then generate output that is more tailored for that # language. For instance, namespaces will be presented as modules, types will be # separated into more groups, etc. # The default value is: NO. OPTIMIZE_OUTPUT_SLICE = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, JavaScript, # Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, # Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: # FortranFree, unknown formatted Fortran: Fortran). In the latter case the parser # tries to guess whether the code is fixed or free formatted code; this is the # default for Fortran type files. For instance to make doxygen treat .inc files # as Fortran files (default is PHP), and .f files as C (default is Fortran), # use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. When specifying no_extension you should add # * to the FILE_PATTERNS. # # Note see also the list of default file extension mappings. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation.
See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. # Minimum value: 0, maximum value: 99, default value: 5. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 0 # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # https://www.riverbankcomputing.com/software/sip/intro) sources only. 
Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). 
# # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. 
LOOKUP_CACHE_SIZE = 0 # The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use # during processing. When set to 0 doxygen will base this on the number of # cores available in the system. You can set it explicitly to a value larger # than 0 to get more control over the balance between CPU load and processing # speed. At this moment only the input processing can be done using multiple # threads. Since this is still an experimental feature the default is set to 1, # which effectively disables parallel processing. Please report any issues you # encounter. Generating dot graphs in parallel is controlled by the # DOT_NUM_THREADS setting. # Minimum value: 0, maximum value: 32, default value: 1. NUM_PROC_THREADS = 1 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual # methods of a class will be included in the documentation. # The default value is: NO. EXTRACT_PRIV_VIRTUAL = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO.
EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If this flag is set to YES, the name of an unnamed parameter in a declaration # will be determined by the corresponding definition. By default unnamed # parameters remain unnamed in the output. # The default value is: YES. RESOLVE_UNNAMED_PARAMS = YES # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. 
This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # declarations. If set to NO, these declarations will be included in the # documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # With the correct setting of option CASE_SENSE_NAMES doxygen will better be # able to match the capabilities of the underlying filesystem. In case the # filesystem is case sensitive (i.e. it supports files in the same directory # whose names only differ in casing), the option must be set to YES to properly # deal with such files in case they appear in the input. For filesystems that # are not case sensitive the option should be set to NO to properly deal with # output files written for symbols that only differ in casing, such as for two # classes, one named CLASS and the other named Class, and to also support # references to files without having to specify the exact matching casing. On # Windows (including Cygwin) and MacOS, users should typically set this option # to NO, whereas on Linux or other Unix flavors it should typically be set to # YES. # The default value is: system dependent. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden.
# The default value is: NO. HIDE_SCOPE_NAMES = YES # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. 
If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. 
GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if <section_label> ... \endif and \cond <section_label> # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES.
SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. 
CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = NO # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. If # EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. WARN_NO_PARAMDOC = YES # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. 
If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. # Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = YES # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../../c/tskit # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: # https://www.gnu.org/software/libiconv/) for the list of possible encodings. # The default value is: UTF-8. 
INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # Note the list of default checked file patterns might differ from the list of # default file extension mappings. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), # *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, # *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.h # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. 
# # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored.
# # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). 
This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. 
REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: # http://clang.llvm.org/) for more accurate parsing at the cost of reduced # performance.
This can be particularly helpful with template rich C++ code for # which doxygen's built-in parser lacks the necessary type information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to # YES then doxygen will add the directory of each input to the include path. # The default value is: YES. CLANG_ADD_INC_PATHS = YES # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = # If clang assisted parsing is enabled you can provide the clang parser with the # path to the directory containing a file called compile_commands.json. This # file is the compilation database (see: # http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the # options used when the source files were built. This is equivalent to # specifying the -p option to a clang tool, such as clang-check. These options # will then be passed to the parser. Any options specified with CLANG_OPTIONS # will be added as well. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. CLANG_DATABASE_PATH = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. 
Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = NO # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. 
See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. 
# Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. 
The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will # consist of multiple levels of tabs that are statically embedded in every HTML # page. Disable this option to support browsers that do not have JavaScript, # like the Qt help browser. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_MENUS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries to 1 will produce a fully collapsed tree by default.
0 is a special value # representing an infinite number of entries and will result in a full expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: # https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To # create a documentation set, doxygen will generate a Makefile in the HTML # output directory. Running make will produce the docset in that directory and # running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. 
# The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: # https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the main .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. 
# This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location (absolute path # including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to # run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. 
To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. 
Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg # tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see # https://inkscape.org) to generate formulas as SVG images instead of PNGs for # the HTML output. These images will generally look nicer at scaled resolutions. # Possible values are: png (the default) and svg (looks nicer but requires the # pdf2svg or inkscape tool). # The default value is: png. # This tag requires that the tag GENERATE_HTML is set to YES. 
HTML_FORMULA_FORMAT = png # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. # Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes have effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands # to create new LaTeX commands to be used in formulas as building blocks. See # the section "Including formulas" for details. FORMULA_MACROFILE = # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e.
MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from https://www.mathjax.org before deployment. # The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: # http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. 
# For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use <access key> + S # (what the <access key> is depends on the OS and browser, but it is typically # <CTRL>, <ALT>/<option>